# HG changeset patch # User bgruening # Date 1494091089 14400 # Node ID fcadce4d9a0601f646e1220a0f9e7f7117a58047 # Parent 0f8646f22b8dfc345965eb6e3922fd6d44312e1c planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n' diff -r 0f8646f22b8d -r fcadce4d9a06 bismark --- a/bismark Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7959 +0,0 @@ -#!/usr/bin/perl -- -use strict; -use warnings; -use IO::Handle; -use Cwd; -$|++; -use Getopt::Long; - - -## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . - - -my $parent_dir = getcwd; -my $bismark_version = 'v0.10.0'; -my $command_line = join (" ",@ARGV); - -### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail -foreach my $arg (@ARGV){ - if ($arg eq '--solexa1.3-quals'){ - $arg = '--phred64-quals'; - } -} -my @filenames; # will be populated by processing the command line - -my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line(); - -my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment -my %chromosomes; # stores the chromosome sequences of the mouse genome -my %counting; # counting various events - -my $seqID_contains_tabs; - -foreach my $filename (@filenames){ - - chdir $parent_dir or die "Unable to move to initial working directory $!\n"; - ### resetting the counting hash and fhs - reset_counters_and_fhs($filename); - $seqID_contains_tabs = 0; - - ### PAIRED-END ALIGNMENTS - if ($filename =~ ','){ - my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file - - $fhs[0]->{name} = 'CTread1GAread2CTgenome'; - $fhs[1]->{name} = 'GAread1CTread2GAgenome'; - $fhs[2]->{name} = 'GAread1CTread2CTgenome'; - $fhs[3]->{name} = 'CTread1GAread2GAgenome'; - - warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; - - my ($filename_1,$filename_2) = (split (/,/,$filename)); - warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; - - ### additional variables only for paired-end alignments - my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file - - ### FastA format - if ($sequence_file_format eq 'FASTA'){ - warn "Input files are in FastA format\n"; - - if ($directional){ - ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number - ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = undef; - $fhs[1]->{inputfile_2} = undef; - $fhs[2]->{inputfile_1} = undef; - $fhs[2]->{inputfile_2} = undef; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - else{ - ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number - ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = $G_to_A_infile_1; - $fhs[1]->{inputfile_2} = $C_to_T_infile_2; - $fhs[2]->{inputfile_1} = $G_to_A_infile_1; - $fhs[2]->{inputfile_2} = $C_to_T_infile_2; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - - if ($bowtie2){ - paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - else{ - paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - } - - ### FastQ format - else{ - warn "Input files are in FastQ format\n"; - if ($directional){ - if ($bowtie2){ - ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number - ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = undef; - $fhs[1]->{inputfile_2} = undef; - $fhs[2]->{inputfile_1} = undef; - $fhs[2]->{inputfile_2} = undef; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - else{ # Bowtie 1 alignments - if ($gzip){ - ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format - $fhs[0]->{inputfile_2} = undef; # no longer needed - $fhs[1]->{inputfile_1} = undef; - $fhs[1]->{inputfile_2} = undef; - $fhs[2]->{inputfile_1} = undef; - $fhs[2]->{inputfile_2} = undef; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format - $fhs[3]->{inputfile_2} = undef; # no longer needed - } - else{ - ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number - ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = undef; - $fhs[1]->{inputfile_2} = undef; - $fhs[2]->{inputfile_1} = undef; - $fhs[2]->{inputfile_2} = undef; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - } - } - elsif($pbat){ # PBAT-Seq - ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1 - ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number - ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = undef; - $fhs[0]->{inputfile_2} = undef; - $fhs[1]->{inputfile_1} = $G_to_A_infile_1; - $fhs[1]->{inputfile_2} = $C_to_T_infile_2; - $fhs[2]->{inputfile_1} = $G_to_A_infile_1; - $fhs[2]->{inputfile_2} = $C_to_T_infile_2; - $fhs[3]->{inputfile_1} = undef; - $fhs[3]->{inputfile_2} = undef; - } - else{ - if ($bowtie2){ - ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number - ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = $G_to_A_infile_1; - $fhs[1]->{inputfile_2} = $C_to_T_infile_2; - $fhs[2]->{inputfile_1} = $G_to_A_infile_1; - $fhs[2]->{inputfile_2} = $C_to_T_infile_2; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - else{ # Bowtie 1 alignments - if ($gzip){ - ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files - $fhs[1]->{inputfile_1} = $G_to_A_infile_1; - $fhs[1]->{inputfile_2} = undef; - $fhs[2]->{inputfile_1} = $G_to_A_infile_1; - $fhs[2]->{inputfile_2} = undef; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files - } - else{ #uncompressed temp files - ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number - ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); - - $fhs[0]->{inputfile_1} = $C_to_T_infile_1; - $fhs[0]->{inputfile_2} = $G_to_A_infile_2; - $fhs[1]->{inputfile_1} = $G_to_A_infile_1; - $fhs[1]->{inputfile_2} = $C_to_T_infile_2; - $fhs[2]->{inputfile_1} = $G_to_A_infile_1; - $fhs[2]->{inputfile_2} = $C_to_T_infile_2; - $fhs[3]->{inputfile_1} = $C_to_T_infile_1; - $fhs[3]->{inputfile_2} = $G_to_A_infile_2; - } - } - } - if ($bowtie2){ - paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - else{ - paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - } - start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - - ### Else we are performing SINGLE-END ALIGNMENTS - else{ - warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; - ### Initialising bisulfite conversion filenames - my ($C_to_T_infile,$G_to_A_infile); - - - ### FastA format - if ($sequence_file_format eq 'FASTA'){ - warn "Inut file is in FastA format\n"; - if ($directional){ - ($C_to_T_infile) = biTransformFastAFiles ($filename); - $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; - } - else{ - ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); - $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; - $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; - } - - ### Creating 4 different bowtie filehandles and storing the first entry - if ($bowtie2){ - single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); - } - else{ - single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); - } - } - - ## FastQ format - else{ - warn "Input file is in FastQ format\n"; - if ($directional){ - ($C_to_T_infile) = biTransformFastQFiles ($filename); - $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; - } - elsif($pbat){ - ($G_to_A_infile) = biTransformFastQFiles ($filename); - $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files - } - else{ - ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); - $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; - $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; - } - - ### Creating up to 4 different bowtie filehandles and storing the first entry - if ($bowtie2){ - single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); - } - elsif ($pbat){ - single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); - } - else{ - single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); - } - } - - start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile); - - } -} - -sub start_methylation_call_procedure_single_ends { - my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; - my ($dir,$filename); - - if ($sequence_file =~ /\//){ - ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $sequence_file; - } - - ### printing all alignments to a results file - my $outfile = $filename; - if ($prefix){ - $outfile = "$prefix.$outfile"; - } - - - if ($bowtie2){ # SAM format is the default for Bowtie 2 - $outfile =~ s/$/_bismark_bt2.sam/; - } - elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) - $outfile =~ s/$/_bismark.txt/; - } - else{ # SAM is the default output - $outfile =~ s/$/_bismark.sam/; - } - - $bam = 0 unless (defined $bam); - - if ($bam == 1){ ### Samtools is installed, writing out BAM directly - $outfile =~ s/sam/bam/; - open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead - $outfile .= '.gz'; - open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - else{ # uncompressed ouput, default - open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - - warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n"; - sleep(1); - - if ($vanilla){ - print OUT "Bismark version: $bismark_version\n"; - } - - ### printing alignment and methylation call summary to a report file - my $reportfile = $filename; - if ($prefix){ - $reportfile = "$prefix.$reportfile"; - } - - if ($bowtie2){ - $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; - } - else{ - $reportfile =~ s/$/_bismark_SE_report.txt/; - } - - open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; - print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; - - if ($unmapped){ - my $unmapped_file = $filename; - if ($prefix){ - $unmapped_file = "$prefix.$unmapped_file"; - } - - $unmapped_file =~ s/$/_unmapped_reads.txt/; - open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; - print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; - } - if ($ambiguous){ - my $ambiguous_file = $filename; - if ($prefix){ - $ambiguous_file = "$prefix.$ambiguous_file"; - } - $ambiguous_file =~ s/$/_ambiguous_reads.txt/; - open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; - print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; - } - - if ($directional){ - print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n"; - } - print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - - - ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time - unless (%chromosomes){ - my $cwd = getcwd; # storing the path of the current working directory - print "Current working directory is: $cwd\n\n"; - read_genome_into_memory($cwd); - } - - unless ($vanilla or $sam_no_hd){ - generate_SAM_header(); - } - - ### Input file is in FastA format - if ($sequence_file_format eq 'FASTA'){ - process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); - } - ### Input file is in FastQ format - else{ - process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); - } -} - -sub start_methylation_call_procedure_paired_ends { - my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - - my ($dir_1,$filename_1); - - if ($sequence_file_1 =~ /\//){ - ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; - } - else{ - $filename_1 = $sequence_file_1; - } - - my ($dir_2,$filename_2); - - if ($sequence_file_2 =~ /\//){ - ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; - } - else{ - $filename_2 = $sequence_file_2; - } - - ### printing all alignments to a results file - my $outfile = $filename_1; - - if ($prefix){ - $outfile = "$prefix.$outfile"; - } - - if ($bowtie2){ # SAM format is the default Bowtie 2 output - $outfile =~ s/$/_bismark_bt2_pe.sam/; - } - elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) - $outfile =~ s/$/_bismark_pe.txt/; - } - else{ # SAM format is the default Bowtie 1 output - $outfile =~ s/$/_bismark_pe.sam/; - } - - $bam = 0 unless (defined $bam); - - if ($bam == 1){ ### Samtools is installed, writing out BAM directly - $outfile =~ s/sam/bam/; - open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead - $outfile .= '.gz'; - open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - else{ # uncompressed ouput, default - open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - } - - warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n"; - sleep(1); - - if ($vanilla){ - print OUT "Bismark version: $bismark_version\n"; - } - - ### printing alignment and methylation call summary to a report file - my $reportfile = $filename_1; - if ($prefix){ - $reportfile = "$prefix.$reportfile"; - } - - if ($bowtie2){ - $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; - } - else{ - $reportfile =~ s/$/_bismark_PE_report.txt/; - } - - open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; - print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; - print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - - - ### Unmapped read output - if ($unmapped){ - my $unmapped_1 = $filename_1; - my $unmapped_2 = $filename_2; - if ($prefix){ - $unmapped_1 = "$prefix.$unmapped_1"; - $unmapped_2 = "$prefix.$unmapped_2"; - } - $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; - $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; - open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; - open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; - print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; - } - - if ($ambiguous){ - my $amb_1 = $filename_1; - my $amb_2 = $filename_2; - if ($prefix){ - $amb_1 = "$prefix.$amb_1"; - $amb_2 = "$prefix.$amb_2"; - } - - $amb_1 =~ s/$/_ambiguous_reads_1.txt/; - $amb_2 =~ s/$/_ambiguous_reads_2.txt/; - open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; - open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; - print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; - } - - if ($directional){ - print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n"; - } - - ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time - unless (%chromosomes){ - my $cwd = getcwd; # storing the path of the current working directory - print "Current working directory is: $cwd\n\n"; - read_genome_into_memory($cwd); - } - - unless ($vanilla or $sam_no_hd){ - generate_SAM_header(); - } - - ### Input files are in FastA format - if ($sequence_file_format eq 'FASTA'){ - process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } - ### Input files are in FastQ format - else{ - process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - } -} - -sub print_final_analysis_report_single_end{ - my ($C_to_T_infile,$G_to_A_infile) = @_; - ### All sequences from the original sequence file have been analysed now - ### deleting temporary C->T or G->A infiles - - if ($directional){ - my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; - if ($deletion_successful == 1){ - warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; - } - else{ - warn "Could not delete temporary file $C_to_T_infile properly $!\n"; - } - } - elsif ($pbat){ - my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; - if ($deletion_successful == 1){ - warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; - } - else{ - warn "Could not delete temporary file $G_to_A_infile properly $!\n"; - } - } - else{ - my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; - if ($deletion_successful == 2){ - warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; - } - else{ - warn "Could not delete temporary files properly $!\n"; - } - } - - ### printing a final report for the alignment procedure - print REPORT "Final Alignment report\n",'='x22,"\n"; - warn "Final Alignment report\n",'='x22,"\n"; - # foreach my $index (0..$#fhs){ - # print "$fhs[$index]->{name}\n"; - # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; - # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; - # } - - ### printing a final report for the methylation call procedure - warn "Sequences analysed in total:\t$counting{sequences_count}\n"; - print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; - my $percent_alignable_sequences; - - if ($counting{sequences_count} == 0){ - $percent_alignable_sequences = 0; - } - else{ - $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); - } - - warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; - print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; - - ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), - ### only calculating the percentage if there were any overruled alignments - if ($counting{low_complexity_alignments_overruled_count}){ - my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); - # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; - } - - print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; - print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; - print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; - print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; - print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; - - print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; - print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; - print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; - print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; - print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; - - if ($directional){ - print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; - print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; - } - - ### detailed information about Cs analysed - warn "Final Cytosine Methylation Report\n",'='x33,"\n"; - my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; - warn "Total number of C's analysed:\t$total_number_of_C\n\n"; - warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; - if ($bowtie2){ - warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; - } - warn "\n"; - - warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; - if ($bowtie2){ - warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; - } - warn "\n"; - - print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; - print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; - - print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; - if ($bowtie2){ - print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; - } - print REPORT "\n"; - - print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; - if ($bowtie2){ - print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; - } - print REPORT "\n"; - - my $percent_meCHG; - if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ - $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); - } - - my $percent_meCHH; - if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ - $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); - } - - my $percent_meCpG; - if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ - $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); - } - - my $percent_meC_unknown; - if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ - $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); - } - - - ### printing methylated CpG percentage if applicable - if ($percent_meCpG){ - warn "C methylated in CpG context:\t${percent_meCpG}%\n"; - print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - } - - ### printing methylated C percentage (CHG context) if applicable - if ($percent_meCHG){ - warn "C methylated in CHG context:\t${percent_meCHG}%\n"; - print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - } - - ### printing methylated C percentage (CHH context) if applicable - if ($percent_meCHH){ - warn "C methylated in CHH context:\t${percent_meCHH}%\n"; - print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; - } - - ### printing methylated C percentage (Unknown C context) if applicable - if ($bowtie2){ - if ($percent_meC_unknown){ - warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; - print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; - } - } - print REPORT "\n\n"; - warn "\n\n"; - - if ($seqID_contains_tabs){ - warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; - print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; - } - - - ########################################################################################################################################### - ### create pie-chart with mapping stats - ########################################################################################################################################### - - - my $filename; - if ($pbat){ - $filename = $G_to_A_infile; - } - else{ - $filename = $C_to_T_infile; - } - - my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified - $pie_chart =~ s/gz$//; - $pie_chart =~ s/_C_to_T\.fastq$//; - $pie_chart =~ s/_G_to_A\.fastq$//; - - # if ($prefix){ - # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation - # } - $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; - - - #Check whether the module GD::Graph is installed - my $gd_graph_installed = 0; - eval{ - require GD::Graph::pie; - GD::Graph::pie->import(); - }; - - unless($@) { - $gd_graph_installed = 1; - } - else{ - warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; - sleep(2); - } - - if ($gd_graph_installed){ - warn "Generating pie chart\n\n"; - sleep(1); - my $graph = GD::Graph::pie->new(600,600); - - my $percent_unaligned; - my $percent_multiple; - my $percent_unextractable; - - if ($counting{sequences_count}){ - $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); - $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); - $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); - } - else{ - $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; - } - - my @aln_stats = ( - ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], - [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], - ); - - $graph->set( - start_angle => 180, - '3d' => 0, - label => 'Alignment stats (single-end)', - suppress_angle => 2, # Only label slices of sufficient size - transparent => 0, - dclrs => [ qw(red lorange dgreen cyan) ], - ) or die $graph->error; - - my $gd = $graph->plot(\@aln_stats) or die $graph->error; - - open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; - binmode PIE; - print PIE $gd->png; - } - - warn "====================\nBismark run complete\n====================\n\n"; - -} - - -sub print_final_analysis_report_paired_ends{ - my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles - if ($directional){ - if ($G_to_A_infile_2){ - my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; - if ($deletion_successful == 2){ - warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; - } - else{ - warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; - } - } - else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete - my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1"; - if ($deletion_successful == 1){ - warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n"; - } - else{ - warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n"; - } - } - } - else{ - if ($G_to_A_infile_2 and $C_to_T_infile_2){ - my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; - if ($deletion_successful == 4){ - warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; - } - else{ - warn "Could not delete temporary files properly: $!\n"; - } - } - else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete - my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"; - if ($deletion_successful == 2){ - warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n"; - } - else{ - warn "Could not delete temporary files properly: $!\n"; - } - } - } - - ### printing a final report for the alignment procedure - warn "Final Alignment report\n",'='x22,"\n"; - print REPORT "Final Alignment report\n",'='x22,"\n"; - # foreach my $index (0..$#fhs){ - # print "$fhs[$index]->{name}\n"; - # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; - # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; - # } - - ### printing a final report for the methylation call procedure - warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; - print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; - - my $percent_alignable_sequence_pairs; - if ($counting{sequences_count} == 0){ - $percent_alignable_sequence_pairs = 0; - } - else{ - $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); - } - print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; - print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; - - print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; - print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; - print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; - print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; - print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; - - - print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; - print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; - print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; - print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; - print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; - ### detailed information about Cs analysed - - if ($directional){ - print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; - print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; - } - - warn "Final Cytosine Methylation Report\n",'='x33,"\n"; - print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; - - my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; - warn "Total number of C's analysed:\t$total_number_of_C\n\n"; - warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; - if ($bowtie2){ - warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; - } - warn "\n"; - - warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; - if ($bowtie2){ - warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; - } - warn "\n"; - - print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; - print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; - if ($bowtie2){ - print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; - } - print REPORT "\n"; - - print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; - if ($bowtie2){ - print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; - } - print REPORT "\n"; - - my $percent_meCHG; - if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ - $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); - } - - my $percent_meCHH; - if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ - $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); - } - - my $percent_meCpG; - if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ - $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); - } - - my $percent_meC_unknown; - if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ - $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); - } - - - ### printing methylated CpG percentage if applicable - if ($percent_meCpG){ - warn "C methylated in CpG context:\t${percent_meCpG}%\n"; - print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - } - - ### printing methylated C percentage in CHG context if applicable - if ($percent_meCHG){ - warn "C methylated in CHG context:\t${percent_meCHG}%\n"; - print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - } - - ### printing methylated C percentage in CHH context if applicable - if ($percent_meCHH){ - warn "C methylated in CHH context:\t${percent_meCHH}%\n"; - print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; - } - - ### printing methylated C percentage (Unknown C context) if applicable - if ($bowtie2){ - if ($percent_meC_unknown){ - warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; - print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; - } - else{ - warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; - print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; - } - } - print REPORT "\n\n"; - warn "\n\n"; - - - ############################################################################################################################################ - ### create pie-chart with mapping stats - ########################################################################################################################################### - - my $filename; - if ($pbat){ - $filename = $G_to_A_infile_1; - } - else{ - $filename = $C_to_T_infile_1; - } - - my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified - $pie_chart =~ s/gz$//; - $pie_chart =~ s/_C_to_T.fastq$//; - $pie_chart =~ s/_G_to_A.fastq$//; - ### special format for gzipped PE Bowtie1 files - $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//; - $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//; - - if ($prefix){ - # prefix is now being prepended to the temp files already - # $pie_chart = "$prefix.$pie_chart"; - } - $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png"; - - #Check whether the module GD::Graph is installed - my $gd_graph_installed = 0; - eval{ - require GD::Graph::pie; - GD::Graph::pie->import(); - }; - - unless($@) { - $gd_graph_installed = 1; - } - else{ - warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; - sleep(2); - } - - if ($gd_graph_installed){ - warn "Generating pie chart\n\n"; - sleep(1); - my $graph = GD::Graph::pie->new(600,600); - - my $percent_unaligned; - my $percent_multiple; - my $percent_unextractable; - - if ($counting{sequences_count}){ - $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); - $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); - $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); - } - else{ - $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; - } - - my @aln_stats = ( - ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], - [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], - ); - - # push @{$mbias_read1[0]},$pos; - - $graph->set( - start_angle => 180, - '3d' => 0, - label => 'Alignment stats (paired-end)', - suppress_angle => 2, # Only label slices of sufficient size - transparent => 0, - dclrs => [ qw(red lorange dgreen cyan) ], - ) or die $graph->error; - - my $gd = $graph->plot(\@aln_stats) or die $graph->error; - - open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; - binmode PIE; - print PIE $gd->png; - } - - warn "====================\nBismark run complete\n====================\n\n"; - -} - -sub process_single_end_fastA_file_for_methylation_call{ - my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; - ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. - ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either - ### the C->T or G->A version - - ### gzipped version of the infile - if ($sequence_file =~ /\.gz$/){ - open (IN,"zcat $sequence_file |") or die $!; - } - else{ - open (IN,$sequence_file) or die $!; - } - - my $count = 0; - - warn "\nReading in the sequence file $sequence_file\n"; - while (1) { - # last if ($counting{sequences_count} > 100); - my $identifier = ; - my $sequence = ; - last unless ($identifier and $sequence); - - $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $counting{sequences_count}++; - if ($counting{sequences_count}%1000000==0) { - warn "Processed $counting{sequences_count} sequences so far\n"; - } - chomp $sequence; - chomp $identifier; - - $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers - - my $return; - if ($bowtie2){ - $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); - } - else{ - $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 - } - - unless ($return){ - $return = 0; - } - - # print the sequence to ambiguous.out if --ambiguous was specified - if ($ambiguous and $return == 2){ - print AMBIG ">$identifier\n"; - print AMBIG "$sequence\n"; - } - - # print the sequence to file if --un was specified - elsif ($unmapped and $return == 1){ - print UNMAPPED ">$identifier\n"; - print UNMAPPED "$sequence\n"; - } - } - print "Processed $counting{sequences_count} sequences in total\n\n"; - - print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); - -} - -sub process_single_end_fastQ_file_for_methylation_call{ - my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; - ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. - ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either - ### the C->T or G->A version - - ### gzipped version of the infile - if ($sequence_file =~ /\.gz$/){ - open (IN,"zcat $sequence_file |") or die $!; - } - else{ - open (IN,$sequence_file) or die $!; - } - - my $count = 0; - - warn "\nReading in the sequence file $sequence_file\n"; - while (1) { - my $identifier = ; - my $sequence = ; - my $identifier_2 = ; - my $quality_value = ; - last unless ($identifier and $sequence and $identifier_2 and $quality_value); - - $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $counting{sequences_count}++; - - if ($counting{sequences_count}%1000000==0) { - warn "Processed $counting{sequences_count} sequences so far\n"; - } - chomp $sequence; - chomp $identifier; - chomp $quality_value; - - $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers - - my $return; - if ($bowtie2){ - $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); - } - else{ - $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 - } - - unless ($return){ - $return = 0; - } - - # print the sequence to ambiguous.out if --ambiguous was specified - if ($ambiguous and $return == 2){ - print AMBIG "\@$identifier\n"; - print AMBIG "$sequence\n"; - print AMBIG $identifier_2; - print AMBIG "$quality_value\n"; - } - - # print the sequence to file if --un was specified - elsif ($unmapped and $return == 1){ - print UNMAPPED "\@$identifier\n"; - print UNMAPPED "$sequence\n"; - print UNMAPPED $identifier_2; - print UNMAPPED "$quality_value\n"; - } - } - print "Processed $counting{sequences_count} sequences in total\n\n"; - - print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); - -} - -sub process_fastA_files_for_paired_end_methylation_calls{ - my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to - ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. - ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the - ### converted genomes (either the C->T or G->A version) - - ### gzipped version of the infiles - if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ - open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; - open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; - } - else{ - open (IN1,$sequence_file_1) or die $!; - open (IN2,$sequence_file_2) or die $!; - } - - warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; - ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one - - my $count = 0; - - while (1) { - # reading from the first input file - my $identifier_1 = ; - my $sequence_1 = ; - # reading from the second input file - my $identifier_2 = ; - my $sequence_2 = ; - last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); - - $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces - $identifier_2 = fix_IDs($identifier_2); - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $counting{sequences_count}++; - if ($counting{sequences_count}%1000000==0) { - warn "Processed $counting{sequences_count} sequence pairs so far\n"; - } - my $orig_identifier_1 = $identifier_1; - my $orig_identifier_2 = $identifier_2; - - chomp $sequence_1; - chomp $identifier_1; - chomp $sequence_2; - chomp $identifier_2; - - $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers - - my $return; - if ($bowtie2){ - $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); - } - else{ - $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); - } - - unless ($return){ - $return = 0; - } - - # print the sequences to ambiguous_1 and _2 if --ambiguous was specified - if ($ambiguous and $return == 2){ - print AMBIG_1 $orig_identifier_1; - print AMBIG_1 "$sequence_1\n"; - print AMBIG_2 $orig_identifier_2; - print AMBIG_2 "$sequence_2\n"; - } - - # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified - elsif ($unmapped and $return == 1){ - print UNMAPPED_1 $orig_identifier_1; - print UNMAPPED_1 "$sequence_1\n"; - print UNMAPPED_2 $orig_identifier_2; - print UNMAPPED_2 "$sequence_2\n"; - } - } - - warn "Processed $counting{sequences_count} sequences in total\n\n"; - - close OUT or die $!; - - print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - -} - -sub process_fastQ_files_for_paired_end_methylation_calls{ - my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to - ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. - ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) - ### of the converted genomes (either C->T or G->A version) - - ### gzipped version of the infiles - if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ - open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; - open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; - } - else{ - open (IN1,$sequence_file_1) or die $!; - open (IN2,$sequence_file_2) or die $!; - } - - my $count = 0; - - warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; - ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one - while (1) { - # reading from the first input file - my $identifier_1 = ; - my $sequence_1 = ; - my $ident_1 = ; # not needed - my $quality_value_1 = ; # not needed - # reading from the second input file - my $identifier_2 = ; - my $sequence_2 = ; - my $ident_2 = ; # not needed - my $quality_value_2 = ; # not needed - last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); - - $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces - $identifier_2 = fix_IDs($identifier_2); - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $counting{sequences_count}++; - if ($counting{sequences_count}%1000000==0) { - warn "Processed $counting{sequences_count} sequence pairs so far\n"; - } - - my $orig_identifier_1 = $identifier_1; - my $orig_identifier_2 = $identifier_2; - - chomp $sequence_1; - chomp $identifier_1; - chomp $sequence_2; - chomp $identifier_2; - chomp $quality_value_1; - chomp $quality_value_2; - - $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID - - my $return; - if ($bowtie2){ - $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); - } - else{ - $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); - } - - unless ($return){ - $return = 0; - } - - # print the sequences to ambiguous_1 and _2 if --ambiguous was specified - if ($ambiguous and $return == 2){ - # seq_1 - print AMBIG_1 $orig_identifier_1; - print AMBIG_1 "$sequence_1\n"; - print AMBIG_1 $ident_1; - print AMBIG_1 "$quality_value_1\n"; - # seq_2 - print AMBIG_2 $orig_identifier_2; - print AMBIG_2 "$sequence_2\n"; - print AMBIG_2 $ident_2; - print AMBIG_2 "$quality_value_2\n"; - } - - # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified - elsif ($unmapped and $return == 1){ - # seq_1 - print UNMAPPED_1 $orig_identifier_1; - print UNMAPPED_1 "$sequence_1\n"; - print UNMAPPED_1 $ident_1; - print UNMAPPED_1 "$quality_value_1\n"; - # seq_2 - print UNMAPPED_2 $orig_identifier_2; - print UNMAPPED_2 "$sequence_2\n"; - print UNMAPPED_2 $ident_2; - print UNMAPPED_2 "$quality_value_2\n"; - } - } - - warn "Processed $counting{sequences_count} sequences in total\n\n"; - - close OUT or die $!; - - print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); - -} - -sub check_bowtie_results_single_end{ - my ($sequence,$identifier,$quality_value) = @_; - - unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout - $quality_value = 'I'x(length$sequence); - } - - my %mismatches = (); - ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome - foreach my $index (0..$#fhs){ - - ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) - next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); - ### if the sequence we are currently looking at produced an alignment we are doing various things with it - if ($fhs[$index]->{last_seq_id} eq $identifier) { - ############################################################### - ### STEP I Now processing the alignment stored in last_line ### - ############################################################### - my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); - ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation - ### we only continue to extract useful information about this alignment if 1 was returned - if ($valid_alignment_found_1 == 1){ - ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself - ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) - my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; - - unless($mismatch_info){ - $mismatch_info = ''; - } - - chomp $mismatch_info; - my $chromosome; - if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ - $chromosome = $mapped_chromosome; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome\n"; - } - ### Now extracting the number of mismatches to the converted genome - my $number_of_mismatches; - if ($mismatch_info eq ''){ - $number_of_mismatches = 0; - } - elsif ($mismatch_info =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info); - $number_of_mismatches = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; - } - ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table - my $alignment_location = join (":",$chromosome,$position); - ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same - ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index - ### number for the found alignment) - unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ - $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; - $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; - $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; - $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; - $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; - } - $number_of_mismatches = undef; - ################################################################################################################################################## - ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a - ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will - ### be returned as $valid_alignment_found and it will then be processed in the next round only. - ################################################################################################################################################## - my $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - my ($seq_id) = split (/\t/,$newline); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - } - else { - # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - next; - } - my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); - ### we only continue to extract useful information about this second alignment if 1 was returned - if ($valid_alignment_found_2 == 1){ - ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself - ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) - my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; - unless($mismatch_info){ - $mismatch_info = ''; - } - chomp $mismatch_info; - - my $chromosome; - if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ - $chromosome = $mapped_chromosome; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome\n"; - } - - ### Now extracting the number of mismatches to the converted genome - my $number_of_mismatches; - if ($mismatch_info eq ''){ - $number_of_mismatches = 0; - } - elsif ($mismatch_info =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info); - $number_of_mismatches = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field\n"; - } - ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table - ### extracting the chromosome number from the bowtie output (see above) - my $alignment_location = join (":",$chromosome,$position); - ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position - ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this - ### case we are not writing the same entry out a second time. - unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ - $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; - $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; - $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; - $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; - $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; - } - #################################################################################################################################### - #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### - #################################################################################################################################### - $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - my ($seq_id) = split (/\t/,$newline); - die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - next; - } - else { - # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - next; - } - ### still within the 2nd sequence in correct orientation found - } - ### still withing the 1st sequence in correct orientation found - } - ### still within the if (last_seq_id eq identifier) condition - } - ### still within foreach index loop - } - ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file - unless(%mismatches){ - $counting{no_single_alignment_found}++; - if ($unmapped){ - return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified - } - else{ - return; - } - } - ####################################################################################################################################################### - ####################################################################################################################################################### - ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### - ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### - ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### - ####################################################################################################################################################### - ####################################################################################################################################################### - ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) - my $sequence_fails = 0; - ### Declaring an empty hash reference which will store all information we need for the methylation call - my $methylation_call_params; # hash reference! - ### sorting in ascending order - foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ - - ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment - if (scalar keys %{$mismatches{$mismatch_number}} == 1){ - for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ - $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; - $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; - $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; - $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; - $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; - } - } - elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ - ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and - ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a - ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome - ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite - ### reaction. E.g. - ### CAGTCACGCGCGCGCG will become - ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. - ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave - ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: - ### G->A conversion: - ### highly methylated: CAATCACACACACACA - ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce - ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the - ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted - ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. - ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions - ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of - ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. - ### In the above example the number of transliterations required to transform the actual sequence - ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) - ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) - ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments - ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed - ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be - ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. - my @three_candidate_seqs; - foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ - my $transliterations_performed; - if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ - $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); - } - elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ - $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); - } - else{ - die "unexpected index number range $!\n"; - } - push @three_candidate_seqs,{ - index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, - bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, - mismatch_number => $mismatch_number, - chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, - position => $mismatches{$mismatch_number}->{$composite_location}->{position}, - seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, - transliterations_performed => $transliterations_performed, - }; - } - ### sorting in ascending order for the lowest number of transliterations performed - @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; - my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; - my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; - my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; - # print "$first_array_element\t$second_array_element\t$third_array_element\n"; - if (($first_array_element*2) < $second_array_element){ - $counting{low_complexity_alignments_overruled_count}++; - ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits - $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; - $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; - $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; - $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; - $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; - # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; - } - else{ - $sequence_fails = 1; - } - } - else{ - $sequence_fails = 1; - } - ### after processing the alignment with the lowest number of mismatches we exit - last; - } - ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions - if ($sequence_fails == 1){ - $counting{unsuitable_sequence_count}++; - if ($ambiguous){ - return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified - } - if ($unmapped){ - return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified - } - else{ - return 0; # => exits to next sequence (default) - } - } - - ### --DIRECTIONAL - ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore - ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol - if ($directional){ - if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ - # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; - $counting{alignments_rejected_count}++; - return 0; - } - } - - ### If the sequence has not been rejected so far it will have a unique best alignment - $counting{unique_best_alignment_count}++; - if ($pbat){ - extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params); - } - else{ - extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); - } - - ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - - ### otherwise we are set to perform the actual methylation call - $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); - - print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); - return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out -} - -sub check_bowtie_results_single_end_bowtie2{ - my ($sequence,$identifier,$quality_value) = @_; - - - unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout - $quality_value = 'I'x(length$sequence); - } - - # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. - # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs - # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; - - my $alignment_ambiguous = 0; - - my %alignments = (); - - ### reading from the Bowtie 2 output filehandles - foreach my $index (0..$#fhs){ - # print "Index: $index\n"; - # print "$fhs[$index]->{last_line}\n"; - # print "$fhs[$index]->{last_seq_id}\n"; - # sleep (1); - ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) - next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); - - ### if the sequence we are currently looking at produced an alignment we are doing various things with it - # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; - - if ($fhs[$index]->{last_seq_id} eq $identifier) { - # SAM format specifications for Bowtie 2 - # (1) Name of read that aligned - # (2) Sum of all applicable flags. Flags relevant to Bowtie are: - # 1 The read is one of a pair - # 2 The alignment is one end of a proper paired-end alignment - # 4 The read has no reported alignments - # 8 The read is one of a pair and has no reported alignments - # 16 The alignment is to the reverse reference strand - # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand - # 64 The read is mate 1 in a pair - # 128 The read is mate 2 in a pair - # 256 The read has multiple mapping states - # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) - # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) - # (5) Mapping quality (255 means MAPQ is not available) - # (6) CIGAR string representation of alignment (* if unavailable) - # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. - # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. - # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. - # (10) Read sequence (reverse-complemented if aligned to the reverse strand) - # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. - # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: - # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. - # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. - # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. - # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. - # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. - # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. - # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. - # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. - # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. - # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. - - my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; - - ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance - if ($flag == 4){ - ## reading in the next alignment, which must be the next sequence - my $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - chomp $newline; - my ($seq_id) = split (/\t/,$newline); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - if ($seq_id eq $identifier){ - die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; - } - next; # next instance - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - next; - } - } - - # if there are one or more proper alignments we can extract the chromosome number - my $chromosome; - if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ - $chromosome = $mapped_chromosome; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome\n"; - } - - ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string - my ($alignment_score,$second_best,$MD_tag); - my @fields = split (/\t/,$fhs[$index]->{last_line}); - - foreach (11..$#fields){ - if ($fields[$_] =~ /AS:i:(.*)/){ - $alignment_score = $1; - } - elsif ($fields[$_] =~ /XS:i:(.*)/){ - $second_best = $1; - } - elsif ($fields[$_] =~ /MD:Z:(.*)/){ - $MD_tag = $1; - } - } - - # warn "First best alignment_score is: '$alignment_score'\n"; - # warn "MD tag is: '$MD_tag'\n"; - die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag); - - if (defined $second_best){ - # warn "second best alignment_score is: '$second_best'\n\n"; - - # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether - if ($alignment_score == $second_best){ - $alignment_ambiguous = 1; - ## need to read and discard all additional ambiguous reads until we reach the next sequence - until ($fhs[$index]->{last_seq_id} ne $identifier){ - my $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - chomp $newline; - my ($seq_id) = split (/\t/,$newline); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - last; # break free in case we have reached the end of the alignment output - } - } - # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; - } - else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment - - my $alignment_location = join (":",$chromosome,$position); - - ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite - ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only - ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB - - unless (exists $alignments{$alignment_location}){ - $alignments{$alignment_location}->{seq_id} = $id; - $alignments{$alignment_location}->{alignment_score} = $alignment_score; - $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; - $alignments{$alignment_location}->{index} = $index; - $alignments{$alignment_location}->{chromosome} = $chromosome; - $alignments{$alignment_location}->{position} = $position; - $alignments{$alignment_location}->{CIGAR} = $cigar; - $alignments{$alignment_location}->{MD_tag} = $MD_tag; - } - - ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence - until ($fhs[$index]->{last_seq_id} ne $identifier){ - my $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - chomp $newline; - my ($seq_id) = split (/\t/,$newline); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - last; # break free in case we have reached the end of the alignment output - } - } - # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; - } - } - else{ # there is no second best hit, so we can just store this one and read in the next sequence - - my $alignment_location = join (":",$chromosome,$position); - - ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite - ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only - ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB - - unless (exists $alignments{$alignment_location}){ - $alignments{$alignment_location}->{seq_id} = $id; - $alignments{$alignment_location}->{alignment_score} = $alignment_score; - $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; - $alignments{$alignment_location}->{index} = $index; - $alignments{$alignment_location}->{chromosome} = $chromosome; - $alignments{$alignment_location}->{position} = $position; - $alignments{$alignment_location}->{MD_tag} = $MD_tag; - $alignments{$alignment_location}->{CIGAR} = $cigar; - } - - my $newline = $fhs[$index]->{fh}-> getline(); - if ($newline){ - chomp $newline; - my ($seq_id) = split (/\t/,$newline); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - if ($seq_id eq $identifier){ - die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; - } - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - } - } - } - } - - ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. - if ($alignment_ambiguous == 1){ - $counting{unsuitable_sequence_count}++; - ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else - # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); - # print "$ambiguous_read_output\n"; - - if ($ambiguous){ - return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified - } - elsif ($unmapped){ - return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified - } - else{ - return 0; - } - } - - ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file - unless(%alignments){ - $counting{no_single_alignment_found}++; - # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); - # print "$unmapped_read_output\n"; - if ($unmapped){ - return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified - } - else{ - return 0; # default - } - } - - ####################################################################################################################################################### - - ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one - ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) - ### alignment score we are discarding the sequence altogether. - ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for - ### opening (5) and extending (3 per bp) the gap. - - ####################################################################################################################################################### - - my $methylation_call_params; # hash reference which will store all information we need for the methylation call - my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) - - ### print contents of %alignments for debugging - # if (scalar keys %alignments > 1){ - # print "\n******\n"; - # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ - # print "Loc: $alignment_location\n"; - # print "ID: $alignments{$alignment_location}->{seq_id}\n"; - # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; - # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; - # print "Index $alignments{$alignment_location}->{index}\n"; - # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; - # print "pos: $alignments{$alignment_location}->{position}\n"; - # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; - # } - # print "\n******\n"; - # } - - ### if there is only 1 entry in the hash with we accept it as the best alignment - if (scalar keys %alignments == 1){ - for my $unique_best_alignment (keys %alignments){ - $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; - $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; - $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; - $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; - $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; - $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; - $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; - } - } - - ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case - ### we boot the sequence altogether - elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ - my $best_alignment_score; - my $best_alignment_location; - foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ - # print "$alignments{$alignment_location}->{alignment_score}\n"; - unless (defined $best_alignment_score){ - $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; - $best_alignment_location = $alignment_location; - # print "setting best alignment score: $best_alignment_score\n"; - } - else{ - ### if the second best alignment has the same alignment score as the first one, the sequence will get booted - if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ - # warn "Same alignment score, the sequence will get booted!\n"; - $sequence_fails = 1; - last; # exiting after the second alignment since we know that the sequence has ambiguous alignments - } - ### else we are going to store the best alignment for further processing - else{ - $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; - $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; - $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; - $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; - $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; - $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; - $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; - last; # exiting after processing the second alignment since the sequence produced a unique best alignment - } - } - } - } - else{ - die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; - } - - ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions - if ($sequence_fails == 1){ - $counting{unsuitable_sequence_count}++; - - ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else - # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); - # print OUT "$ambiguous_read_output\n"; - - if ($ambiguous){ - return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified - } - elsif ($unmapped){ - return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified - } - else{ - return 0; # => exits to next sequence (default) - } - } - - ### --DIRECTIONAL - ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore - ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol - if ($directional){ - if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ - # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; - $counting{alignments_rejected_count}++; - return 0; - } - } - - ### If the sequence has not been rejected so far it has a unique best alignment - $counting{unique_best_alignment_count}++; - - ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well - extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); - - ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - - - ### otherwise we are set to perform the actual methylation call - $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); - print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); - return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out -} - - -sub determine_number_of_transliterations_performed{ - my ($sequence,$read_conversion) = @_; - my $number_of_transliterations; - if ($read_conversion eq 'CT'){ - $number_of_transliterations = $sequence =~ tr/C/T/; - } - elsif ($read_conversion eq 'GA'){ - $number_of_transliterations = $sequence =~ tr/G/A/; - } - else{ - die "Read conversion mode of the read was not specified $!\n"; - } - return $number_of_transliterations; -} - -sub decide_whether_single_end_alignment_is_valid{ - my ($index,$identifier) = @_; - - # extracting from Bowtie 1 format - my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; - - ### ensuring that the entry is the correct sequence - if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ - ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically - ### sensible alignments - my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); - ### If the orientation was correct can we move on - if ($orientation == 1){ - return 1; ### 1st possibility for a sequence to pass - } - ### If the alignment was in the wrong orientation we need to read in a new line - elsif($orientation == 0){ - my $newline = $fhs[$index]->{fh}->getline(); - if ($newline){ - ($id,$strand) = (split (/\t/,$newline))[0,1]; - - ### ensuring that the next entry is still the correct sequence - if ($id eq $identifier){ - ### checking orientation again - $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); - ### If the orientation was correct can we move on - if ($orientation == 1){ - $fhs[$index]->{last_seq_id} = $id; - $fhs[$index]->{last_line} = $newline; - return 1; ### 2nd possibility for a sequence to pass - } - ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs - elsif ($orientation == 0){ - $newline = $fhs[$index]->{fh}->getline(); - if ($newline){ - my ($seq_id) = split (/\t/,$newline); - ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with - ### the same fields of the just read next entry - die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); - $fhs[$index]->{last_seq_id} = $seq_id; - $fhs[$index]->{last_line} = $newline; - return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation - } - else{ - # assigning undef to last_seq_id and last_line (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation - } - } - else{ - die "The orientation of the alignment must be either correct or incorrect\n"; - } - } - ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs - else{ - $fhs[$index]->{last_seq_id} = $id; - $fhs[$index]->{last_line} = $newline; - return 0; # processing the new alignment result only in the next round - } - } - else { - # assigning undef to last_seq_id and last_line (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line} = undef; - return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation - } - } - else{ - die "The orientation of the alignment must be either correct or incorrect\n"; - } - } - ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round - else{ - return 0; - } -} -######################### -### BOWTIE 1 | PAIRED-END -######################### - -sub check_bowtie_results_paired_ends{ - my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; - - ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 - unless ($quality_value_1){ - $quality_value_1 = 'I'x(length$sequence_1); - } - unless ($quality_value_2){ - $quality_value_2 = 'I'x(length$sequence_2); - } - - # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; - # sleep (1); - my %mismatches = (); - ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome - - - ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. - ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). - ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) - ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary - ### strands are not being reported by specifying --directional - - foreach my $index (0,3,1,2){ - ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) - next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); - ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it - if ($fhs[$index]->{last_seq_id} eq $identifier) { - # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; - - ################################################################################## - ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### - ################################################################################## - my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); - ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong - ### orientation. We only continue to extract useful information about this alignment if 1 was returned - if ($valid_alignment_found == 1){ - ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. - ### we store the useful information in %mismatches - my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; - my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; - chomp $mismatch_info_1; - chomp $mismatch_info_2; - - ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted - my ($chromosome_1,$chromosome_2); - if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ - $chromosome_1 = $mapped_chromosome_1; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_1\n"; - } - if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ - $chromosome_2 = $mapped_chromosome_2; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_2\n"; - } - - ### Now extracting the number of mismatches to the converted genome - my $number_of_mismatches_1; - my $number_of_mismatches_2; - if ($mismatch_info_1 eq ''){ - $number_of_mismatches_1 = 0; - } - elsif ($mismatch_info_1 =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info_1); - $number_of_mismatches_1 = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field\n"; - } - if ($mismatch_info_2 eq ''){ - $number_of_mismatches_2 = 0; - } - elsif ($mismatch_info_2 =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info_2); - $number_of_mismatches_2 = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field\n"; - } - ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments - my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; - ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table - die "Position 1 is higher than position 2" if ($position_1 > $position_2); - die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); - my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); - ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same - ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index - ### number for the found alignment) - unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ - $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine - $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine - $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; - } - ################################################################################################################################################### - ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### - ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### - ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### - ### this round ### - ################################################################################################################################################### - my $newline_1 = $fhs[$index]->{fh}-> getline(); - my $newline_2 = $fhs[$index]->{fh}-> getline(); - - if ($newline_1 and $newline_2){ - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - - if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag - $fhs[$index]->{last_seq_id} = $seq_id_1; - } - elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag - $fhs[$index]->{last_seq_id} = $seq_id_2; - } - else{ - die "Either read 1 or read 2 needs to end on '/1'\n"; - } - - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - } - else { - # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - next; # jumping to the next index - } - ### Now processing the entry we just stored in last_line_1 and last_line_2 - $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); - ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to - ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation - if ($valid_alignment_found == 1){ - ### we store the useful information in %mismatches - ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; - ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; - chomp $mismatch_info_1; - chomp $mismatch_info_2; - ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) - if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ - $chromosome_1 = $mapped_chromosome_1; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_1\n"; - } - if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ - $chromosome_2 = $mapped_chromosome_2; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_2\n"; - } - - $number_of_mismatches_1=''; - $number_of_mismatches_2=''; - ### Now extracting the number of mismatches to the converted genome - if ($mismatch_info_1 eq ''){ - $number_of_mismatches_1 = 0; - } - elsif ($mismatch_info_1 =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info_1); - $number_of_mismatches_1 = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field\n"; - } - if ($mismatch_info_2 eq ''){ - $number_of_mismatches_2 = 0; - } - elsif ($mismatch_info_2 =~ /^\d/){ - my @mismatches = split (/,/,$mismatch_info_2); - $number_of_mismatches_2 = scalar @mismatches; - } - else{ - die "Something weird is going on with the mismatch field\n"; - } - ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments - $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; - ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table - die "position 1 is greater than position 2" if ($position_1 > $position_2); - die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); - $alignment_location = join(":",$chromosome_1,$position_1,$position_2); - ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same - ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index - ### number for the found alignment) - unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ - $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine - $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine - $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; - $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; - } - ############################################################################################################################################### - ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### - ############################################################################################################################################### - $newline_1 = $fhs[$index]->{fh}-> getline(); - $newline_2 = $fhs[$index]->{fh}-> getline(); - - if ($newline_1 and $newline_2){ - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - - if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag - $fhs[$index]->{last_seq_id} = $seq_id_1; - } - if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag - $fhs[$index]->{last_seq_id} = $seq_id_2; - } - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - } - else { - # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - next; # jumping to the next index - } - ### within the 2nd sequence pair alignment in correct orientation found - } - ### within the 1st sequence pair alignment in correct orientation found - } - ### still within the (last_seq_id eq identifier) condition - } - ### still within foreach index loop - } - ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file - unless(%mismatches){ - $counting{no_single_alignment_found}++; - return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified - } - ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) - my $sequence_pair_fails = 0; - ### Declaring an empty hash reference which will store all information we need for the methylation call - my $methylation_call_params; # hash reference! - ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the - ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the - ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether - foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ - #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; - foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ - #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; - # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; - } - if (scalar keys %{$mismatches{$mismatch_number}} == 1){ - # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; - for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ - $methylation_call_params->{$identifier}->{seq_id} = $identifier; - $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; - $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; - $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; - $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; - $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; - $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); - $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; - $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; - $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; - } - } - else{ - $sequence_pair_fails = 1; - } - ### after processing the alignment with the lowest number of mismatches we exit - last; - } - ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions - if ($sequence_pair_fails == 1){ - $counting{unsuitable_sequence_count}++; - if ($ambiguous){ - return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified - } - if ($unmapped){ - return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified - } - else{ - return 0; # => exits to next sequence (default) - } - } - - ### --DIRECTIONAL - ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore - ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol - if ($directional){ - if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ - # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; - $counting{alignments_rejected_count}++; - return 0; - } - } - - ### If the sequence has not been rejected so far it does have a unique best alignment - $counting{unique_best_alignment_count}++; - extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); - - ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - - ### otherwise we are set to perform the actual methylation call - $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); - $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); - - print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); - return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 -} - -######################### -### BOWTIE 2 | PAIRED-END -######################### - -sub check_bowtie_results_paired_ends_bowtie2{ - my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; - - ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 - unless ($quality_value_1){ - $quality_value_1 = 'I'x(length$sequence_1); - } - - unless ($quality_value_2){ - $quality_value_2 = 'I'x(length$sequence_2); - } - - - # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; - - - my %alignments; - my $alignment_ambiguous = 0; - - ### reading from the Bowtie 2 output filehandles - - ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. - ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). - ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) - ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary - ### strands are not being reported when '--directional' is specified - - foreach my $index (0,3,1,2){ - ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) - next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); - - ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it - if ($fhs[$index]->{last_seq_id} eq $identifier) { - - my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; - my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; - # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; - # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; - # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; - # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; - $id_1 =~ s/\/1$//; - $id_2 =~ s/\/2$//; - - # SAM format specifications for Bowtie 2 - # (1) Name of read that aligned - # (2) Sum of all applicable flags. Flags relevant to Bowtie are: - # 1 The read is one of a pair - # 2 The alignment is one end of a proper paired-end alignment - # 4 The read has no reported alignments - # 8 The read is one of a pair and has no reported alignments - # 16 The alignment is to the reverse reference strand - # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand - # 64 The read is mate 1 in a pair - # 128 The read is mate 2 in a pair - # 256 The read has multiple mapping states - # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) - # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) - # (5) Mapping quality (255 means MAPQ is not available) - # (6) CIGAR string representation of alignment (* if unavailable) - # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. - # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. - # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. - # (10) Read sequence (reverse-complemented if aligned to the reverse strand) - # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. - # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: - # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. - # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. - # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. - # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. - # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. - # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. - # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. - # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. - # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. - # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. - - ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). - ### We can store the next alignment and move on to the next Bowtie 2 instance - if ($flag_1 == 77 and $flag_2 == 141){ - ## reading in the next alignment, which must be the next sequence - my $newline_1 = $fhs[$index]->{fh}-> getline(); - my $newline_2 = $fhs[$index]->{fh}-> getline(); - - if ($newline_1 and $newline_2){ - chomp $newline_1; - chomp $newline_2; - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - $seq_id_1 =~ s/\/1$//; - $seq_id_2 =~ s/\/2$//; - $fhs[$index]->{last_seq_id} = $seq_id_1; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - - # print "current sequence ($identifier) did not map, reading in next sequence\n"; - # print "$index\t$fhs[$index]->{last_seq_id}\n"; - # print "$index\t$fhs[$index]->{last_line_1}\n"; - # print "$index\t$fhs[$index]->{last_line_2}\n"; - next; # next instance - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - next; - } - } - - ### If there are one or more proper alignments we can extract the chromosome number - my ($chromosome_1,$chromosome_2); - if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ - $chromosome_1 = $mapped_chromosome_1; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_1\n"; - } - if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ - $chromosome_2 = $mapped_chromosome_2; - } - else{ - die "Chromosome number extraction failed for $mapped_chromosome_2\n"; - } - - die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); - - ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string - my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); - - my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); - my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); - - foreach (11..$#fields_1){ - if ($fields_1[$_] =~ /AS:i:(.*)/){ - $alignment_score_1 = $1; - } - elsif ($fields_1[$_] =~ /XS:i:(.*)/){ - $second_best_1 = $1; - } - elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ - $MD_tag_1 = $1; - } - } - - foreach (11..$#fields_2){ - if ($fields_2[$_] =~ /AS:i:(.*)/){ - $alignment_score_2 = $1; - } - elsif ($fields_2[$_] =~ /XS:i:(.*)/){ - $second_best_2 = $1; - } - elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ - $MD_tag_2 = $1; - } - } - - die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); - die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); - - # warn "First read 1 alignment score is: '$alignment_score_1'\n"; - # warn "First read 2 alignment score is: '$alignment_score_2'\n"; - # warn "MD tag 1 is: '$MD_tag_1'\n"; - # warn "MD tag 2 is: '$MD_tag_2'\n"; - - ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments - my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; - # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; - - if (defined $second_best_1 and defined $second_best_2){ - my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; - # warn "Second best alignment_score_1 is: '$second_best_1'\n"; - # warn "Second best alignment_score_2 is: '$second_best_2'\n"; - # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; - - # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether - if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ - $alignment_ambiguous = 1; - # print "This read will be chucked (AS==XS detected)!\n"; - - ## need to read and discard all additional ambiguous reads until we reach the next sequence - until ($fhs[$index]->{last_seq_id} ne $identifier){ - my $newline_1 = $fhs[$index]->{fh}-> getline(); - my $newline_2 = $fhs[$index]->{fh}-> getline(); - if ($newline_1 and $newline_2){ - chomp $newline_1; - chomp $newline_2; - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - $seq_id_1 =~ s/\/1$//; - $seq_id_2 =~ s/\/2$//; - # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; - - $fhs[$index]->{last_seq_id} = $seq_id_1; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - } - else{ - # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - last; # break free if the end of the alignment output was reached - } - } - # if ($fhs[$index]->{last_seq_id}){ - # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; - # } - } - else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment - - my $alignment_location; - if ($position_1 <= $position_2){ - $alignment_location = join(":",$chromosome_1,$position_1,$position_2); - } - elsif($position_2 < $position_1){ - $alignment_location = join(":",$chromosome_1,$position_2,$position_1); - } - - ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite - ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only - ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB - - unless (exists $alignments{$alignment_location}){ - $alignments{$alignment_location}->{seq_id} = $id_1; - $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; - $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; - $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; - $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; - $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; - $alignments{$alignment_location}->{index} = $index; - $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine - $alignments{$alignment_location}->{position_1} = $position_1; - $alignments{$alignment_location}->{position_2} = $position_2; - $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; - $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; - $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; - $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; - $alignments{$alignment_location}->{flag_1} = $flag_1; - $alignments{$alignment_location}->{flag_2} = $flag_2; - } - # warn "added best of several alignments to \%alignments hash\n"; - - ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence - until ($fhs[$index]->{last_seq_id} ne $identifier){ - my $newline_1 = $fhs[$index]->{fh}-> getline(); - my $newline_2 = $fhs[$index]->{fh}-> getline(); - if ($newline_1 and $newline_2){ - chomp $newline_1; - chomp $newline_2; - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - $seq_id_1 =~ s/\/1$//; - $seq_id_2 =~ s/\/2$//; - # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; - - $fhs[$index]->{last_seq_id} = $seq_id_1; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - } - else{ - # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - last; # break free if the end of the alignment output was reached - } - } - # if($fhs[$index]->{last_seq_id}){ - # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; - # } - } - } - else{ # there is no second best hit, so we can just store this one and read in the next sequence - - my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); - # print "$alignment_location\n"; - ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse - ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite - ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only - ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB - - unless (exists $alignments{$alignment_location}){ - $alignments{$alignment_location}->{seq_id} = $id_1; - $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; - $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; - $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; - $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; - $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; - $alignments{$alignment_location}->{index} = $index; - $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine - $alignments{$alignment_location}->{position_1} = $position_1; - $alignments{$alignment_location}->{position_2} = $position_2; - $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; - $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; - $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; - $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; - $alignments{$alignment_location}->{flag_1} = $flag_1; - $alignments{$alignment_location}->{flag_2} = $flag_2; - } - - # warn "added unique alignment to \%alignments hash\n"; - - # Now reading and storing the next read pair - my $newline_1 = $fhs[$index]->{fh}-> getline(); - my $newline_2 = $fhs[$index]->{fh}-> getline(); - if ($newline_1 and $newline_2){ - chomp $newline_1; - chomp $newline_2; - # print "$newline_1\n"; - # print "$newline_2\n"; - my ($seq_id_1) = split (/\t/,$newline_1); - my ($seq_id_2) = split (/\t/,$newline_2); - $seq_id_1 =~ s/\/1$//; - $seq_id_2 =~ s/\/2$//; - # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; - - $fhs[$index]->{last_seq_id} = $seq_id_1; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - - if ($seq_id_1 eq $identifier){ - die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; - } - } - else{ - # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - } - } - } - } - - ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format - if ($alignment_ambiguous == 1){ - $counting{unsuitable_sequence_count}++; - ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else - # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); - # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); - # print "$ambiguous_read_1\n"; - # print "$ambiguous_read_2\n"; - - if ($ambiguous){ - return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified - } - elsif ($unmapped){ - return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified - } - else{ - return 0; - } - } - - ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file - unless (%alignments){ - $counting{no_single_alignment_found}++; - - # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); - # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); - # print "$unmapped_read_1\n"; - # print "$unmapped_read_2\n"; - if ($unmapped){ - return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified - } - else{ - return 0; - } - } - - ####################################################################################################################################################### - - ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one - ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) - ### alignment score we are discarding the sequence pair altogether. - ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) - ### and extending (3 per bp) the gap. - - ####################################################################################################################################################### - - ### Declaring an empty hash reference which will store all information we need for the methylation call - my $methylation_call_params; # hash reference - my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) - - ### print contents of %alignments for debugging - ## if (scalar keys %alignments >= 1){ - # print "\n******\n"; - # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ - # print "Loc: $alignment_location\n"; - # print "ID: $alignments{$alignment_location}->{seq_id}\n"; - # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; - # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; - # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; - # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; - # print "Index $alignments{$alignment_location}->{index}\n"; - # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; - # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; - # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; - # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; - # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; - # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; - # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; - # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; - # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; - # } - # print "\n******\n"; - # } - - ### if there is only 1 entry in the %alignments hash we accept it as the best alignment - if (scalar keys %alignments == 1){ - for my $unique_best_alignment (keys %alignments){ - $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; - $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; - $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; - $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; - $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; - $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; - $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; - $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; - $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; - $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; - $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; - $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; - $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; - $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; - $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; - } - } - - ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case - ### we boot the sequence pair altogether) - elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ - my $best_sum_of_alignment_scores; - my $best_alignment_location; - foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ - # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; - unless (defined $best_sum_of_alignment_scores){ - $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; - $best_alignment_location = $alignment_location; - # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; - } - else{ - ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted - if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ - # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; - $sequence_pair_fails = 1; - last; # exiting since we know that the sequence has ambiguous alignments - } - ### else we are going to store the best alignment for further processing - else{ - $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; - $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; - $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; - $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; - $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; - $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; - $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; - $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; - $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; - $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; - $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; - $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; - $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; - $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; - $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; - last; # exiting since the sequence produced a unique best alignment - } - } - } - } - else{ - die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; - } - - ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions - if ($sequence_pair_fails == 1){ - $counting{unsuitable_sequence_count}++; - - ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else - # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); - # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); - # print "$ambiguous_read_1\n"; - # print "$ambiguous_read_2\n"; - - if ($ambiguous){ - return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified - } - elsif ($unmapped){ - return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified - } - else{ - return 0; # => exits to next sequence pair (default) - } - } - - ### --DIRECTIONAL - ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore - ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol - if ($directional){ - if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ - # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; - $counting{alignments_rejected_count}++; - return 0; - } - } - - ### If the sequence pair has not been rejected so far it does have a unique best alignment - $counting{unique_best_alignment_count}++; - extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); - - ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; - $counting{genomic_sequence_could_not_be_extracted_count}++; - return 0; - } - - ### now we are set to perform the actual methylation call - $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); - $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); - # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; - # print " $sequence_2\n"; - # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; - # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; - - print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); - return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 -} - -### - -sub decide_whether_paired_end_alignment_is_valid{ - my ($index,$identifier) = @_; - my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; - my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; - chomp $mismatch_info_1; - chomp $mismatch_info_2; - my $seq_id_1 = $id_1; - my $seq_id_2 = $id_2; - $seq_id_1 =~ s/\/1$//; # removing the read /1 - $seq_id_2 =~ s/\/1$//; # removing the read /1 - - ### ensuring that the current entry is the correct sequence - if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ - ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically - ### sensible alignments - my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); - ### If the orientation was correct can we move on - if ($orientation == 1){ - return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS - } - ### If the alignment was in the wrong orientation we need to read in two new lines - elsif($orientation == 0){ - my $newline_1 = $fhs[$index]->{fh}->getline(); - my $newline_2 = $fhs[$index]->{fh}->getline(); - if ($newline_1 and $newline_2){ - ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) - ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; - ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; - - my $seqid; - $seq_id_1 = $id_1; - $seq_id_2 = $id_2; - # we need to capture the first read (ending on /1) - if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag - $seqid = $seq_id_1; - } - elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag - $seqid = $seq_id_2; - } - else{ - die "One of the two reads needs to end on /1!!"; - } - - ### ensuring that the next entry is still the correct sequence - if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ - ### checking orientation again - $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); - ### If the orientation was correct can we move on - if ($orientation == 1){ - ### Writing the current sequence to last_line_1 and last_line_2 - $fhs[$index]->{last_seq_id} = $seqid; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS - } - ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be - ### the next entry) - elsif ($orientation == 0){ - $newline_1 = $fhs[$index]->{fh}->getline(); - $newline_2 = $fhs[$index]->{fh}->getline(); - if ($newline_1 and $newline_2){ - ($seq_id_1) = split (/\t/,$newline_1); - ($seq_id_2) = split (/\t/,$newline_2); - - $seqid = ''; - if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag - $seqid = $seq_id_1; - } - elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag - $seqid = $seq_id_2; - } - else{ - die "One of the two reads needs to end on /1!!"; - } - - ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with - ### the same fields of the just read next entry - die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); - $fhs[$index]->{last_seq_id} = $seqid; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation - } - else { - ### assigning undef to last_seq_id and last_line (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation - } - } - else{ - die "The orientation of the alignment must be either correct or incorrect\n"; - } - } - ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs - else{ - $fhs[$index]->{last_seq_id} = $seqid; - $fhs[$index]->{last_line_1} = $newline_1; - $fhs[$index]->{last_line_2} = $newline_2; - return 0; # processing the new alignment result only in the next round - } - } - else { - # assigning undef to last_seq_id and both last_lines (end of bowtie output) - $fhs[$index]->{last_seq_id} = undef; - $fhs[$index]->{last_line_1} = undef; - $fhs[$index]->{last_line_2} = undef; - return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation - } - } - else{ - die "The orientation of the alignment must be either correct or incorrect\n"; - } - } - ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round - else{ - return 0; - } -} - -### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END - -sub extract_corresponding_genomic_sequence_paired_ends { - my ($sequence_identifier,$methylation_call_params) = @_; - ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the - ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call - my $alignment_read_1; - my $alignment_read_2; - my $read_conversion_info_1; - my $read_conversion_info_2; - my $genome_conversion; - - ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call - ### if the C happens to be at the first or last position of the actually observed sequence - my $non_bisulfite_sequence_1; - my $non_bisulfite_sequence_2; - - ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was - ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic - ### sequences around! - ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) - if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ - ### [Index 0, sequence originated from (converted) forward strand] - $counting{CT_GA_CT_count}++; - $alignment_read_1 = '+'; - $alignment_read_2 = '-'; - $read_conversion_info_1 = 'CT'; - $read_conversion_info_2 = 'GA'; - $genome_conversion = 'CT'; - ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) - ### for hits on the forward strand we need to capture 2 extra bases at the 3' end - - $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change - - ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) - ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 - - $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); - ### the reverse strand sequence needs to be reverse complemented - $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); - } - else{ - $non_bisulfite_sequence_2 = ''; - } - } - - ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ - ### [Index 1, sequence originated from complementary to (converted) reverse strand] - $counting{GA_CT_GA_count}++; - $alignment_read_1 = '+'; - $alignment_read_2 = '-'; - $read_conversion_info_1 = 'GA'; - $read_conversion_info_2 = 'CT'; - $genome_conversion = 'GA'; - - ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) - ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end - if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 - $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 - } - else{ - $non_bisulfite_sequence_1 = ''; - } - - ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) - ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation - $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 - ### the reverse strand sequence needs to be reverse complemented - $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); - } - - ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ - ### [Index 2, sequence originated from the complementary to (converted) forward strand] - $counting{GA_CT_CT_count}++; - $alignment_read_1 = '-'; - $alignment_read_2 = '+'; - $read_conversion_info_1 = 'GA'; - $read_conversion_info_2 = 'CT'; - $genome_conversion = 'CT'; - - ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! - ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand - ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation - $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 - ### the reverse strand sequence needs to be reverse complemented - $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); - - ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) - ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! - ### Read 2 is CT converted so we need to capture 2 extra 3' bases - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 - $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 - } - else{ - $non_bisulfite_sequence_2 = ''; - } - } - - ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ - ### [Index 3, sequence originated from the (converted) reverse strand] - $counting{CT_GA_GA_count}++; - $alignment_read_1 = '-'; - $alignment_read_2 = '+'; - $read_conversion_info_1 = 'CT'; - $read_conversion_info_2 = 'GA'; - $genome_conversion = 'GA'; - - ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! - ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand - ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation - if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 - $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 - ### the reverse strand sequence needs to be reverse complemented - $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); - } - else{ - $non_bisulfite_sequence_1 = ''; - } - - ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) - ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! - ### Read 2 is GA converted so we need to capture 2 extra 5' bases - $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 - } - else{ - die "Too many bowtie result filehandles\n"; - } - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - - $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; - $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; - $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; - $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; - $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; -} - -### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END - -sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ - my ($sequence_identifier,$methylation_call_params) = @_; - ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the - ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call - - my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; - my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; - my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; - my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; - # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; - # sleep(10); - ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and - ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence - - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - my $alignment_read_1; - my $alignment_read_2; - my $read_conversion_info_1; - my $read_conversion_info_2; - my $genome_conversion; - - ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call - ### if the C happens to be at the last position of the actually observed sequence - my $non_bisulfite_sequence_1 = ''; - my $non_bisulfite_sequence_2 = ''; - - ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings - my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; - my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; - - # parsing CIGAR 1 string - my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation - my @ops_1 = split (/\d+/,$cigar_1); # storing the operation - shift @ops_1; # remove the empty first element - die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); - # parsing CIGAR 2 string - my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation - my @ops_2 = split (/\d+/,$cigar_2); # storing the operation - shift @ops_2; # remove the empty first element - die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); - - my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output - my $indels_2 = 0; - - ### Extracting read 1 genomic sequence ### - - # extracting 2 additional bp at the 5' end (read 1) - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ - # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; - return; - } - $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); - } - - foreach (0..$#len_1){ - if ($ops_1[$_] eq 'M'){ - # extracting genomic sequence - $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); - # warn "$non_bisulfite_sequence_1\n"; - # adjusting position - $pos_1 += $len_1[$_]; - } - elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence - # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls - $non_bisulfite_sequence_1 .= 'N' x $len_1[$_]; - # warn "$non_bisulfite_sequence_1\n"; - # position doesn't need adjusting - $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output - } - elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence - # we do not add any genomic sequence but only adjust the position - # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; - $pos_1 += $len_1[$_]; - $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output - } - elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die - die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; - } - else{ - die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; - } - } - - ### 3' end of read 1 - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; - return; - } - - $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); - } - - - ### Extracting read 2 genomic sequence ### - - ### 5' end of read 2 - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; - return; - } - $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); - } - - foreach (0..$#len_2){ - if ($ops_2[$_] eq 'M'){ - # extracting genomic sequence - $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); - # warn "$non_bisulfite_sequence_2\n"; - # adjusting position - $pos_2 += $len_2[$_]; - } - elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence - # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls - $non_bisulfite_sequence_2 .= 'N' x $len_2[$_]; - # warn "$non_bisulfite_sequence_2\n"; - # position doesn't need adjusting - $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output - } - elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence - # we do not add any genomic sequence but only adjust the position - # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; - $pos_2 += $len_2[$_]; - $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output - } - elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die - die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; - } - else{ - die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; - } - } - - ### 3' end of read 2 - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise - # need to set read 1 as well now to prevent warning - # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; - # sleep(5); - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; - return; - } - $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); - } - - ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was - ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly - - ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) - if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ - ### [Index 0, sequence originated from (converted) forward strand] - $counting{CT_GA_CT_count}++; - $alignment_read_1 = '+'; - $alignment_read_2 = '-'; - $read_conversion_info_1 = 'CT'; - $read_conversion_info_2 = 'GA'; - $genome_conversion = 'CT'; - ### Read 1 is always the forward hit - ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented - $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); - } - - ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ - ### [Index 1, sequence originated from complementary to (converted) bottom strand] - $counting{GA_CT_GA_count}++; - $alignment_read_1 = '+'; - $alignment_read_2 = '-'; - $read_conversion_info_1 = 'GA'; - $read_conversion_info_2 = 'CT'; - $genome_conversion = 'GA'; - ### Read 1 is always the forward hit - ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented - $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); - } - - ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ - ### [Index 2, sequence originated from the complementary to (converted) top strand] - $counting{GA_CT_CT_count}++; - $alignment_read_1 = '-'; - $alignment_read_2 = '+'; - $read_conversion_info_1 = 'GA'; - $read_conversion_info_2 = 'CT'; - $genome_conversion = 'CT'; - - ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented - $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); - } - - ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ - ### [Index 3, sequence originated from the (converted) reverse strand] - $counting{CT_GA_GA_count}++; - $alignment_read_1 = '-'; - $alignment_read_2 = '+'; - $read_conversion_info_1 = 'CT'; - $read_conversion_info_2 = 'GA'; - $genome_conversion = 'GA'; - ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented - $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); - } - else{ - die "Too many bowtie result filehandles\n"; - } - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - - $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; - $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; - $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; - $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; - $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; - ## the end position of a read is stored in $pos - $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; - $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; - $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; - $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; -} - -########################################## -### PRINT SINGLE END RESULTS: Bowtie 1 ### -########################################## - -sub print_bisulfite_mapping_result_single_end{ - my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; - - ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) - if ($phred64){ - $quality_value = convert_phred64_quals_to_phred33($quality_value); - } - elsif ($solexa){ - $quality_value = convert_solexa_quals_to_phred33($quality_value); - } - - ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. - $methylation_call_params->{$identifier}->{position} += 1; - - ### writing every uniquely mapped read and its methylation call to the output file - if ($vanilla){ - my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); - print OUT "$bowtie1_output\n"; - } - else{ # SAM output, default since Bismark v1.0.0 - single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script - } -} - -########################################## -### PRINT SINGLE END RESULTS: Bowtie 2 ### -########################################## - -sub print_bisulfite_mapping_result_single_end_bowtie2{ - my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; - - ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) - if ($phred64){ - $quality_value = convert_phred64_quals_to_phred33($quality_value); - } - elsif ($solexa){ - $quality_value = convert_solexa_quals_to_phred33($quality_value); - } - - ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) - single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script -} - -########################################## -### PRINT PAIRED END ESULTS: Bowtie 1 ### -########################################## - -sub print_bisulfite_mapping_results_paired_ends{ - my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; - - ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) - if ($phred64){ - $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); - $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); - } - elsif ($solexa){ - $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); - $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); - } - - ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) - $methylation_call_params->{$identifier}->{start_seq_1} += 1; - - ### writing every single aligned read and its methylation call to the output file - if ($vanilla){ - my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); - print OUT "$bowtie1_output_paired_end\n"; - } - else{ # SAM output, default since Bismark v1.0.0 - paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script - } - -} - -########################################## -### PRINT PAIRED END ESULTS: Bowtie 2 ### -########################################## - -sub print_bisulfite_mapping_results_paired_ends_bowtie2{ - my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; - - ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) - if ($phred64){ - $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); - $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); - } - elsif ($solexa){ - $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); - $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); - } - - ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) - paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script - -} - - -sub convert_phred64_quals_to_phred33{ - - my $qual = shift; - my @quals = split (//,$qual); - my @new_quals; - - foreach my $index (0..$#quals){ - my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); - my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); - $new_quals[$index] = $phred33_quality_string; - } - - my $phred33_quality = join ("",@new_quals); - return $phred33_quality; -} - -sub convert_solexa_quals_to_phred33{ - - my $qual = shift; - my @quals = split (//,$qual); - my @new_quals; - - foreach my $index (0..$#quals){ - my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); - my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); - $new_quals[$index] = $phred33_quality_string; - } - - my $phred33_quality = join ("",@new_quals); - return $phred33_quality; -} - -sub convert_phred_score_into_phred33_quality_string{ - my $qual = shift; - $qual = chr($qual+33); - return $qual; -} - -sub convert_phred64_quality_string_into_phred_score{ - my $string = shift; - my $qual = ord($string)-64; - return $qual; -} - -sub convert_solexa_pre1_3_quality_string_into_phred_score{ - ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 - my $string = shift; - my $qual = ord($string)-59; - return $qual; -} - - -sub extract_corresponding_genomic_sequence_single_end { - my ($sequence_identifier,$methylation_call_params) = @_; - ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the - ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call - - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - my $alignment_strand; - my $read_conversion_info; - my $genome_conversion; - ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and - ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, - ### if the C happens to be at the last position of the actually observed sequence - my $non_bisulfite_sequence; - ### depending on the conversion we want to make need to capture 1 extra base at the 3' end - - ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) - if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ - ### [Index 0, sequence originated from (converted) forward strand] - $counting{CT_CT_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'CT'; - $genome_conversion = 'CT'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 - ### + 2 extra base at the 3' end - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ - ### [Index 1, sequence originated from (converted) reverse strand] - $counting{CT_GA_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'CT'; - $genome_conversion = 'GA'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > - ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 - ## reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ - ### [Index 2, sequence originated from complementary to (converted) forward strand] - $counting{GA_CT_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'GA'; - $genome_conversion = 'CT'; - - ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 - ## reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ - ### [Index 3, sequence originated from complementary to (converted) reverse strand] - $counting{GA_GA_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'GA'; - $genome_conversion = 'GA'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > - ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 - } - else{ - $non_bisulfite_sequence = ''; - } - } - else{ - die "Too many bowtie result filehandles\n"; - } - - $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; - $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; - $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; - - ### at this point we can also determine the end position of a read - $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); -} - -sub extract_corresponding_genomic_sequence_single_end_pbat { - my ($sequence_identifier,$methylation_call_params) = @_; - ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the - ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call - - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - my $alignment_strand; - my $read_conversion_info; - my $genome_conversion; - ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and - ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, - ### if the C happens to be at the last position of the actually observed sequence - my $non_bisulfite_sequence; - ### depending on the conversion we want to make need to capture 1 extra base at the 3' end - - my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1! - - ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) - if ($pbat_index == 0){ - ### [Index 0, sequence originated from (converted) forward strand] - $counting{CT_CT_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'CT'; - $genome_conversion = 'CT'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 - ### + 2 extra base at the 3' end - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) - elsif ($pbat_index == 1){ - ### [Index 1, sequence originated from (converted) reverse strand] - $counting{CT_GA_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'CT'; - $genome_conversion = 'GA'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > - ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 - ## reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) - elsif ($pbat_index == 2){ - ### [Index 2, sequence originated from complementary to (converted) forward strand] - $counting{GA_CT_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'GA'; - $genome_conversion = 'CT'; - - ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 - ## reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - else{ - $non_bisulfite_sequence = ''; - } - } - - ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) - elsif ($pbat_index == 3){ - ### [Index 3, sequence originated from complementary to (converted) reverse strand] - $counting{GA_GA_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'GA'; - $genome_conversion = 'GA'; - - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > - ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand - $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 - } - else{ - $non_bisulfite_sequence = ''; - } - } - else{ - die "Too many bowtie result filehandles\n"; - } - - $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; - $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; - $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; - - ### at this point we can also determine the end position of a read - $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); -} - - -sub extract_corresponding_genomic_sequence_single_end_bowtie2{ - my ($sequence_identifier,$methylation_call_params) = @_; - - my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info}; - my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; - - ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the - ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call - - ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, - ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions - my $alignment_strand; - my $read_conversion_info; - my $genome_conversion; - ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and - ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence - my $non_bisulfite_sequence = ''; - - ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings - my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; - - # parsing CIGAR string - my @len = split (/\D+/,$cigar); # storing the length per operation - my @ops = split (/\d+/,$cigar); # storing the operation - shift @ops; # remove the empty first element - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; - return; - } - $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); - } - my $indels = 0; - - foreach (0..$#len){ - if ($ops[$_] eq 'M'){ - #extracting genomic sequence - $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); - # adjusting position - $pos += $len[$_]; - } - elsif ($ops[$_] eq 'I'){ # insertion in the read sequence - # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls - $non_bisulfite_sequence .= 'N' x $len[$_]; - # warn "$non_bisulfite_sequence\n"; - # position doesn't need to be adjusting - $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) - } - elsif ($ops[$_] eq 'D'){ # deletion in the read sequence - # we do not add any genomic sequence but only adjust the position - $pos += $len[$_]; - $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) - } - elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die - die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; - } - else{ - die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; - } - } - - ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) - if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ - ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome - unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; - return; - } - $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); - # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; - } - - - - ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) - if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ - ### [Index 0, sequence originated from (converted) forward strand] - $counting{CT_CT_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'CT'; - $genome_conversion = 'CT'; - } - - ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ - ### [Index 1, sequence originated from (converted) reverse strand] - $counting{CT_GA_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'CT'; - $genome_conversion = 'GA'; - - ### reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - - ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ - ### [Index 2, sequence originated from complementary to (converted) forward strand] - $counting{GA_CT_count}++; - $alignment_strand = '-'; - $read_conversion_info = 'GA'; - $genome_conversion = 'CT'; - - ### reverse complement! - $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); - } - - ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) - elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ - ### [Index 3, sequence originated from complementary to (converted) reverse strand] - $counting{GA_GA_count}++; - $alignment_strand = '+'; - $read_conversion_info = 'GA'; - $genome_conversion = 'GA'; - - } - else{ - die "Too many Bowtie 2 result filehandles\n"; - } - - $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; - $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; - $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; - $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; - - ### the end position of a read is stored in $pos - $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; - $methylation_call_params->{$sequence_identifier}->{indels} = $indels; -} - -### METHYLATION CALL - -sub methylation_call{ - my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; - ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one - my @seq = split(//,$sequence_actually_observed); - my @genomic = split(//,$genomic_sequence); - # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; - ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either - ### CpG, CHH or CHG context - - ################################################################# - ### . for bases not involving cytosines ### - ### X for methylated C in CHG context (was protected) ### - ### x for not methylated C in CHG context (was converted) ### - ### H for methylated C in CHH context (was protected) ### - ### h for not methylated C in CHH context (was converted) ### - ### Z for methylated C in CpG context (was protected) ### - ### z for not methylated C in CpG context (was converted) ### - ### U for methylated C in unknown context (was protected) ### - ### u for not methylated C in unknwon context (was converted) ### - ################################################################# - - my @match =(); - warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 - my $methyl_CHH_count = 0; - my $methyl_CHG_count = 0; - my $methyl_CpG_count = 0; - my $methyl_C_unknown_count = 0; - my $unmethylated_CHH_count = 0; - my $unmethylated_CHG_count = 0; - my $unmethylated_CpG_count = 0; - my $unmethylated_C_unknown_count = 0; - - if ($read_conversion eq 'CT'){ - for my $index (0..$#seq) { - if ($seq[$index] eq $genomic[$index]) { - ### The residue can only be a C if it was not converted to T, i.e. protected my methylation - if ($genomic[$index] eq 'C') { - ### If the residue is a C we want to know if it was in CpG context or in any other context - my $downstream_base = $genomic[$index+1]; - - if ($downstream_base eq 'G'){ - ++$methyl_CpG_count; - push @match,'Z'; # protected C, methylated, in CpG context - } - elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) - ++$methyl_C_unknown_count; - push @match,'U'; # protected C, methylated, in Unknown context - } - else { - ### C in not in CpG-context, determining the second downstream base context - my $second_downstream_base = $genomic[$index+2]; - - if ($second_downstream_base eq 'G'){ - ++$methyl_CHG_count; - push @match,'X'; # protected C, methylated, in CHG context - } - elsif ($second_downstream_base eq 'N'){ - ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) - push @match,'U'; # protected C, methylated, in Unknown context - } - else{ - ++$methyl_CHH_count; - push @match,'H'; # protected C, methylated, in CHH context - } - } - } - else { - push @match, '.'; - } - } - elsif ($seq[$index] ne $genomic[$index]) { - ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts - ### in the actually observed sequence - if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { - ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context - my $downstream_base = $genomic[$index+1]; - - if ($downstream_base eq 'G'){ - ++$unmethylated_CpG_count; - push @match,'z'; # converted C, not methylated, in CpG context - } - elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) - ++$unmethylated_C_unknown_count; - push @match,'u'; # converted C, not methylated, in Unknown context - } - else{ - ### C in not in CpG-context, determining the second downstream base context - my $second_downstream_base = $genomic[$index+2]; - - if ($second_downstream_base eq 'G'){ - ++$unmethylated_CHG_count; - push @match,'x'; # converted C, not methylated, in CHG context - } - elsif ($second_downstream_base eq 'N'){ - ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) - push @match,'u'; # converted C, not methylated, in Unknown context - } - else{ - ++$unmethylated_CHH_count; - push @match,'h'; # converted C, not methylated, in CHH context - } - } - } - ### all other mismatches are not of interest for a methylation call - else { - push @match,'.'; - } - } - else{ - die "There can be only 2 possibilities\n"; - } - } - } - elsif ($read_conversion eq 'GA'){ - # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; - - for my $index (0..$#seq) { - if ($seq[$index] eq $genomic[$index+2]) { - ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation - if ($genomic[$index+2] eq 'G') { - ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need - ### to look if the base upstream is a C - - my $upstream_base = $genomic[$index+1]; - - if ($upstream_base eq 'C'){ - ++$methyl_CpG_count; - push @match,'Z'; # protected C on opposing strand, methylated, in CpG context - } - elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) - ++$methyl_C_unknown_count; - push @match,'U'; # protected C on opposing strand, methylated, in Unknown context - } - else{ - ### C in not in CpG-context, determining the second upstream base context - my $second_upstream_base = $genomic[$index]; - - if ($second_upstream_base eq 'C'){ - ++$methyl_CHG_count; - push @match,'X'; # protected C on opposing strand, methylated, in CHG context - } - elsif ($second_upstream_base eq 'N'){ - ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) - push @match,'U'; # protected C, methylated, in Unknown context - } - else{ - ++$methyl_CHH_count; - push @match,'H'; # protected C on opposing strand, methylated, in CHH context - } - } - } - else{ - push @match, '.'; - } - } - elsif ($seq[$index] ne $genomic[$index+2]) { - ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts - ### on the opposing strand, so G to A conversions in the actually observed sequence - if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { - ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if - ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! - - my $upstream_base = $genomic[$index+1]; - - if ($upstream_base eq 'C'){ - ++$unmethylated_CpG_count; - push @match,'z'; # converted C on opposing strand, not methylated, in CpG context - } - elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) - ++$unmethylated_C_unknown_count; - push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context - } - else{ - ### C in not in CpG-context, determining the second upstream base context - my $second_upstream_base = $genomic[$index]; - - if ($second_upstream_base eq 'C'){ - ++$unmethylated_CHG_count; - push @match,'x'; # converted C on opposing strand, not methylated, in CHG context - } - elsif ($second_upstream_base eq 'N'){ - ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) - push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context - } - else{ - ++$unmethylated_CHH_count; - push @match,'h'; # converted C on opposing strand, not methylated, in CHH context - } - } - } - ### all other mismatches are not of interest for a methylation call - else { - push @match,'.'; - } - } - else{ - die "There can be only 2 possibilities\n"; - } - } - } - else{ - die "Strand conversion info is required to perform a methylation call\n"; - } - - my $methylation_call = join ("",@match); - - $counting{total_meCHH_count} += $methyl_CHH_count; - $counting{total_meCHG_count} += $methyl_CHG_count; - $counting{total_meCpG_count} += $methyl_CpG_count; - $counting{total_meC_unknown_count} += $methyl_C_unknown_count; - $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; - $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; - $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; - $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; - - # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; - return $methylation_call; -} - -sub read_genome_into_memory{ - ## working directoy - my $cwd = shift; - ## reading in and storing the specified genome in the %chromosomes hash - chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; - print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; - - my @chromosome_filenames = <*.fa>; - - ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta - unless (@chromosome_filenames){ - @chromosome_filenames = <*.fasta>; - } - - unless (@chromosome_filenames){ - die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; - } - - foreach my $chromosome_filename (@chromosome_filenames){ - - open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; - ### first line needs to be a fastA header - my $first_line = ; - chomp $first_line; - $first_line =~ s/\r//; - - ### Extracting chromosome name from the FastA header - my $chromosome_name = extract_chromosome_name($first_line); - - my $sequence; - while (){ - chomp; - $_ =~ s/\r//; - if ($_ =~ /^>/){ - ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) - if (exists $chromosomes{$chromosome_name}){ - print "chr $chromosome_name (",length $sequence ," bp)\n"; - die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; - } - else { - if (length($sequence) == 0){ - warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; - } - print "chr $chromosome_name (",length $sequence ," bp)\n"; - $chromosomes{$chromosome_name} = $sequence; - } - ### resetting the sequence variable - $sequence = ''; - ### setting new chromosome name - $chromosome_name = extract_chromosome_name($_); - } - else{ - $sequence .= uc$_; - } - } - - if (exists $chromosomes{$chromosome_name}){ - print "chr $chromosome_name (",length $sequence ," bp)\t"; - die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; - } - else{ - if (length($sequence) == 0){ - warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; - } - print "chr $chromosome_name (",length $sequence ," bp)\n"; - $chromosomes{$chromosome_name} = $sequence; - } - } - print "\n"; - chdir $cwd or die "Failed to move to directory $cwd\n"; -} - -sub extract_chromosome_name { - ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well - my $fasta_header = shift; - if ($fasta_header =~ s/^>//){ - my ($chromosome_name) = split (/\s+/,$fasta_header); - return $chromosome_name; - } - else{ - die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; - } -} - -sub reverse_complement{ - my $sequence = shift; - $sequence =~ tr/CATG/GTAC/; - $sequence = reverse($sequence); - return $sequence; -} - -sub biTransformFastAFiles { - my $file = shift; - my ($dir,$filename); - if ($file =~ /\//){ - ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $file; - } - - ### gzipped version of the infile - if ($file =~ /\.gz$/){ - open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; - } - else{ - open (IN,$file) or die "Couldn't read from file $file: $!\n"; - } - - if ($skip){ - warn "Skipping the first $skip reads from $file\n"; - sleep (1); - } - if ($upto){ - warn "Processing reads up to sequence no. $upto from $file\n"; - sleep (1); - } - - my $C_to_T_infile = my $G_to_A_infile = $filename; - - if ($gzip){ - $C_to_T_infile =~ s/$/_C_to_T.fa.gz/; - $G_to_A_infile =~ s/$/_G_to_A.fa.gz/; - } - else{ - $C_to_T_infile =~ s/$/_C_to_T.fa/; - $G_to_A_infile =~ s/$/_G_to_A.fa/; - } - - if ($prefix){ - # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; - $C_to_T_infile = "$prefix.$C_to_T_infile"; - $G_to_A_infile = "$prefix.$G_to_A_infile"; - # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; - } - - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - - if ($gzip){ - open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; - } - - unless ($directional){ - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - if ($gzip){ - open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - } - - my $count = 0; - - while (1){ - my $header = ; - my $sequence= ; - last unless ($header and $sequence); - - $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $sequence = uc$sequence; # make input file case insensitive - - # detecting if the input file contains tab stops, as this is likely to result in no alignments - if (index($header,"\t") != -1){ - $seqID_contains_tabs++; - } - - ### small check if the sequence seems to be in FastA format - die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); - - my $sequence_C_to_T = $sequence; - $sequence_C_to_T =~ tr/C/T/; - print CTOT "$header$sequence_C_to_T"; - - unless ($directional){ - my $sequence_G_to_A = $sequence; - $sequence_G_to_A =~ tr/G/A/; - print GTOA "$header$sequence_G_to_A"; - } - } - close CTOT or die "Failed to close filehandle $!\n"; - - if ($directional){ - warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; - } - else{ - close GTOA or die "Failed to close filehandle $!\n"; - warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; - } - return ($C_to_T_infile,$G_to_A_infile); -} - -sub biTransformFastAFiles_paired_end { - my ($file,$read_number) = @_; - - if ($gzip){ - warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n"; - sleep (2); - } - - my ($dir,$filename); - if ($file =~ /\//){ - ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $file; - } - - ### gzipped version of the infile - if ($file =~ /\.gz$/){ - open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; - } - else{ - open (IN,$file) or die "Couldn't read from file $file: $!\n"; - } - - if ($skip){ - warn "Skipping the first $skip reads from $file\n"; - sleep (1); - } - if ($upto){ - warn "Processing reads up to sequence no. $upto from $file\n"; - sleep (1); - } - - my $C_to_T_infile = my $G_to_A_infile = $filename; - - $C_to_T_infile =~ s/$/_C_to_T.fa/; - $G_to_A_infile =~ s/$/_G_to_A.fa/; - - if ($prefix){ - # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; - $C_to_T_infile = "$prefix.$C_to_T_infile"; - $G_to_A_infile = "$prefix.$G_to_A_infile"; - # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; - } - - if ($directional){ - if ($read_number == 1){ - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; - } - elsif ($read_number == 2){ - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - else{ - die "Read number needs to be 1 or 2, but was: $read_number\n\n"; - } - } - else{ # all four strand output - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - - my $count = 0; - - while (1){ - my $header = ; - my $sequence= ; - last unless ($header and $sequence); - - $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $sequence = uc$sequence; # make input file case insensitive - - # detecting if the input file contains tab stops, as this is likely to result in no alignments - if (index($header,"\t") != -1){ - $seqID_contains_tabs++; - } - - ## small check if the sequence seems to be in FastA format - die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/); - - if ($read_number == 1){ - if ($bowtie2){ - $header =~ s/$/\/1\/1/; - } - else{ - $header =~ s/$/\/1/; - } - } - elsif ($read_number == 2){ - if ($bowtie2){ - $header =~ s/$/\/2\/2/; - } - else{ - $header =~ s/$/\/2/; - } - } - else{ - die "Read number needs to be 1 or 2, but was: $read_number\n\n"; - } - my $sequence_C_to_T = my $sequence_G_to_A = $sequence; - - $sequence_C_to_T =~ tr/C/T/; - $sequence_G_to_A =~ tr/G/A/; - - if ($directional){ - - if ($read_number == 1){ - print CTOT "$header$sequence_C_to_T"; - } - elsif ($read_number == 2){ - print GTOA "$header$sequence_G_to_A"; - } - } - else{ - print CTOT "$header$sequence_C_to_T"; - print GTOA "$header$sequence_G_to_A"; - } - } - - if ($directional){ - if ($read_number == 1){ - warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; - } - else{ - warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; - } - } - else{ - warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; - } - - if ($directional){ - if ($read_number == 1){ - return ($C_to_T_infile); - } - else{ - return ($G_to_A_infile); - } - } - else{ - return ($C_to_T_infile,$G_to_A_infile); - } -} - - -sub biTransformFastQFiles { - my $file = shift; - my ($dir,$filename); - if ($file =~ /\//){ - ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $file; - } - - ### gzipped version of the infile - if ($file =~ /\.gz$/){ - open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; - } - else{ - open (IN,$file) or die "Couldn't read from file $file: $!\n"; - } - - if ($skip){ - warn "Skipping the first $skip reads from $file\n"; - sleep (1); - } - if ($upto){ - warn "Processing reads up to sequence no. $upto from $file\n"; - sleep (1); - } - - my $C_to_T_infile = my $G_to_A_infile = $filename; - - if ($prefix){ - # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; - $C_to_T_infile = "$prefix.$C_to_T_infile"; - $G_to_A_infile = "$prefix.$G_to_A_infile"; - # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; - } - - if ($pbat){ # PBAT-Seq - if ($gzip){ - $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; - } - else{ - $G_to_A_infile =~ s/$/_G_to_A.fastq/; - } - - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - - if ($gzip){ - open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - } - else{ # directional or non-directional - if ($gzip){ - $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; - } - else{ - $C_to_T_infile =~ s/$/_C_to_T.fastq/; - } - - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - - if ($gzip){ - open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option - } - - unless ($directional){ - if ($gzip){ - $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; - } - else{ - $G_to_A_infile =~ s/$/_G_to_A.fastq/; - } - - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - - if ($gzip){ - open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - } - } - - my $count = 0; - while (1){ - my $identifier = ; - my $sequence = ; - my $identifier2 = ; - my $quality_score = ; - last unless ($identifier and $sequence and $identifier2 and $quality_score); - - $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces - - ++$count; - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $sequence = uc$sequence; # make input file case insensitive - - # detecting if the input file contains tab stops, as this is likely to result in no alignments - if (index($identifier,"\t") != -1){ - $seqID_contains_tabs++; - } - - ## small check if the sequence file appears to be a FastQ file - if ($count == 1){ - if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ - die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; - } - } - - if ($pbat){ - my $sequence_G_to_A = $sequence; - $sequence_G_to_A =~ tr/G/A/; - print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); - } - else{ # directional or non-directional - my $sequence_C_to_T = $sequence; - $sequence_C_to_T =~ tr/C/T/; - print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); - - unless ($directional){ - my $sequence_G_to_A = $sequence; - $sequence_G_to_A =~ tr/G/A/; - print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); - } - } - } - - if ($directional){ - close CTOT or die "Failed to close filehandle $!\n"; - warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; - } - elsif($pbat){ - warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; - close GTOA or die "Failed to close filehandle $!\n"; - return ($G_to_A_infile); - } - else{ - close CTOT or die "Failed to close filehandle $!\n"; - close GTOA or die "Failed to close filehandle $!\n"; - warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; - } - - return ($C_to_T_infile,$G_to_A_infile); -} - -sub biTransformFastQFiles_paired_end { - my ($file,$read_number) = @_; - my ($dir,$filename); - - if ($file =~ /\//){ - ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $file; - } - - ### gzipped version of the infile - if ($file =~ /\.gz$/){ - open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; - } - else{ - open (IN,$file) or die "Couldn't read from file $file: $!\n"; - } - - if ($skip){ - warn "Skipping the first $skip reads from $file\n"; - sleep (1); - } - if ($upto){ - warn "Processing reads up to sequence no. $upto from $file\n"; - sleep (1); - } - - my $C_to_T_infile = my $G_to_A_infile = $filename; - - if ($gzip){ - $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; - $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; - } - else{ - $C_to_T_infile =~ s/$/_C_to_T.fastq/; - $G_to_A_infile =~ s/$/_G_to_A.fastq/; - } - - if ($prefix){ - # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; - $C_to_T_infile = "$prefix.$C_to_T_infile"; - $G_to_A_infile = "$prefix.$G_to_A_infile"; - # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; - } - - if ($directional){ - if ($read_number == 1){ - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - if ($gzip){ - open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; - } - } - elsif ($read_number == 2){ - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - if ($gzip){ - open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - } - else{ - die "Read number needs to be 1 or 2, but was $read_number!\n\n"; - } - } - else{ - warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; - warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; - if ($gzip){ - open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; - open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; - } - else{ - open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; - open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; - } - } - - my $count = 0; - while (1){ - my $identifier = ; - my $sequence = ; - my $identifier2 = ; - my $quality_score = ; - last unless ($identifier and $sequence and $identifier2 and $quality_score); - ++$count; - - $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $sequence= uc$sequence; # make input file case insensitive - - ## small check if the sequence file appears to be a FastQ file - if ($count == 1){ - if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ - die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; - } - } - my $sequence_C_to_T = my $sequence_G_to_A = $sequence; - - if ($read_number == 1){ - if ($bowtie2){ - $identifier =~ s/$/\/1\/1/; - } - else{ - $identifier =~ s/$/\/1/; - } - } - elsif ($read_number == 2){ - if ($bowtie2){ - $identifier =~ s/$/\/2\/2/; - } - else{ - $identifier =~ s/$/\/2/; - } - } - else{ - die "Read number needs to be 1 or 2\n"; - } - - $sequence_C_to_T =~ tr/C/T/; - $sequence_G_to_A =~ tr/G/A/; - - if ($directional){ - if ($read_number == 1){ - print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); - } - else{ - print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); - } - } - else{ - print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); - print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); - } - } - - if ($directional){ - if ($read_number == 1){ - warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; - } - else{ - warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; - } - } - else{ - warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; - } - if ($directional){ - if ($read_number == 1){ - close CTOT or die "Failed to close filehandle $!\n"; - return ($C_to_T_infile); - } - else{ - close GTOA or die "Failed to close filehandle $!\n"; - return ($G_to_A_infile); - } - } - else{ - close CTOT or die "Failed to close filehandle $!\n"; - close GTOA or die "Failed to close filehandle $!\n"; - return ($C_to_T_infile,$G_to_A_infile); - } -} - - -### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES - -sub biTransformFastQFiles_paired_end_bowtie1_gzip { - my ($file_1,$file_2) = @_; - my ($dir,$filename); - - if ($file_1 =~ /\//){ - ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; - } - else{ - $filename = $file_1; - } - - ### gzipped version of infile 1 - if ($file_1 =~ /\.gz$/){ - open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n"; - } - else{ - open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; - } - ### gzipped version of infile 2 - if ($file_2 =~ /\.gz$/){ - open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n"; - } - else{ - open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; - } - - - if ($skip){ - warn "Skipping the first $skip reads from $file_1 and $file_2\n"; - sleep (1); - } - if ($upto){ - warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; - sleep (1); - } - - my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; - - if ($prefix){ - # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; - $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; - $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; - # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; - } - - $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; - $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; - # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; - - warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; - open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; - # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; - - unless ($directional){ - print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; - open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; - } - - ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format: - ### - - my $count = 0; - while (1){ - my $identifier_1 = ; - my $sequence_1 = ; - my $identifier2_1 = ; - my $quality_score_1 = ; - - my $identifier_2 = ; - my $sequence_2 = ; - my $identifier2_2 = ; - my $quality_score_2 = ; - - last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); - - ++$count; - - ## small check if the sequence file appears to be a FastQ file - if ($count == 1){ - if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ - die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; - } - if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ - die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; - } - } - - $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces - chomp $identifier_1; - chomp $sequence_1; - chomp $sequence_2; - chomp $quality_score_1; - chomp $quality_score_2; - - $identifier_1 =~ s/^\@//; - $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) - - if ($skip){ - next unless ($count > $skip); - } - if ($upto){ - last if ($count > $upto); - } - - $sequence_1 = uc$sequence_1; # make input file 1 case insensitive - $sequence_2 = uc$sequence_2; # make input file 2 case insensitive - - # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; - my $sequence_1_C_to_T = $sequence_1; - my $sequence_2_G_to_A = $sequence_2; - $sequence_1_C_to_T =~ tr/C/T/; - $sequence_2_G_to_A =~ tr/G/A/; - - print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; - - unless ($directional){ - my $sequence_1_G_to_A = $sequence_1; - my $sequence_2_C_to_T = $sequence_2; - $sequence_1_G_to_A =~ tr/G/A/; - $sequence_2_C_to_T =~ tr/C/T/; - print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; - } - } - - close CTPLUSGA or die "Couldn't close filehandle\n"; - warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; - - if ($directional){ - warn "\n"; - return ($CT_plus_GA_infile); - } - else{ - close GAPLUSCT or die "Couldn't close filehandle\n"; - warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; - return ($CT_plus_GA_infile,$GA_plus_CT_infile); - } -} - - -sub fix_IDs{ - my $id = shift; - $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores - return $id; -} - -sub ensure_sensical_alignment_orientation_single_end{ - my $index = shift; # index number if the sequence produced an alignment - my $strand = shift; - ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one - my $orientation = 0; - ############################################################################################################## - ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) - ## here we only want reads in the forward (+) orientation - if ($fhs[$index]->{name} eq 'CTreadCTgenome') { - ### if the alignment is (+) we count it, and return 1 for a correct orientation - if ($strand eq '+') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the orientation equals (-) the alignment is nonsensical - elsif ($strand eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - } - ############################################################################################################### - ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) - ## here we only want reads in the forward (-) orientation - elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { - ### if the alignment is (-) we count it and return 1 for a correct orientation - if ($strand eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the orientation equals (+) the alignment is nonsensical - elsif ($strand eq '+') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - } - ############################################################################################################### - ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) - ## here we only want reads in the forward (-) orientation - elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { - ### if the alignment is (-) we count it and return 1 for a correct orientation - if ($strand eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the orientation equals (+) the alignment is nonsensical - elsif ($strand eq '+') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - } - ############################################################################################################### - ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) - ## here we only want reads in the forward (+) orientation - elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { - ### if the alignment is (+) we count it and return 1 for a correct orientation - if ($strand eq '+') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the orientation equals (-) the alignment is nonsensical - elsif ($strand eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - } else{ - die "One of the above conditions must be true\n"; - } -} - -sub ensure_sensical_alignment_orientation_paired_ends{ - my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment - ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one - my $orientation = 0; - ############################################################################################################## - ## [Index 0, sequence originated from (converted) forward strand] - ## CT converted read 1 - ## GA converted read 2 - ## CT converted genome - ## here we only want read 1 in (+) orientation and read 2 in (-) orientation - if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { - ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation - if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical - elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - else{ - die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; - } - } - ############################################################################################################### - ## [Index 1, sequence originated from (converted) reverse strand] - ## GA converted read 1 - ## CT converted read 2 - ## GA converted genome - ## here we only want read 1 in (+) orientation and read 2 in (-) orientation - elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { - ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation - if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical - elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - else{ - die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; - } - } - ############################################################################################################### - ## [Index 2, sequence originated from complementary to (converted) forward strand] - ## GA converted read 1 - ## CT converted read 2 - ## CT converted genome - ## here we only want read 1 in (-) orientation and read 2 in (+) orientation - elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { - ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation - if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical - elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - else{ - die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; - } - } - ############################################################################################################### - ## [Index 3, sequence originated from complementary to (converted) reverse strand] - ## CT converted read 1 - ## GA converted read 2 - ## GA converted genome - ## here we only want read 1 in (+) orientation and read 2 in (-) orientation - elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { - ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation - if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { - $fhs[$index]->{seen}++; - $orientation = 1; - return $orientation; - } - ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical - elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { - $fhs[$index]->{wrong_strand}++; - return $orientation; - } - else{ - die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; - } - } - else{ - die "One of the above conditions must be true\n"; - } -} - -##################################################################################################################################################### - -### Bowtie 1 (default) | PAIRED-END | FASTA - -sub paired_end_align_fragments_to_bisulfite_genome_fastA { - - my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - - if ($directional){ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; - } - else{ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; - } - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the - ## data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - if ($directional){ - unless ($fh->{inputfile_1}){ - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - next; - } - } - - my $bt_options = $bowtie_options; - if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ - $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt_options .= ' --nofw'; - } - - warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; - open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; - - my $line_1 = $fh->{fh}->getline(); - my $line_2 = $fh->{fh}->getline(); - - # if Bowtie produces an alignment we store the first line of the output - if ($line_1 and $line_2) { - chomp $line_1; - chomp $line_2; - my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) - my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line - - ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. - ### We will thus identify which sequence was read 1 and store this ID as last_seq_id - - if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present - $fh->{last_seq_id} = $id_1; - } - elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present - $fh->{last_seq_id} = $id_2; - } - else{ - die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; - } - - $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 - $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 - warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; - } - # otherwise we just initialise last_seq_id and last_lines as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - } - } -} - -### Bowtie 2 | PAIRED-END | FASTA - -sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { - my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - if ($directional){ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; - } - else{ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; - } - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the - ## data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - if ($directional){ - unless ($fh->{inputfile_1}){ - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - next; - } - } - - my $bt2_options = $bowtie_options; - if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ - $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt2_options .= ' --nofw'; - } - - warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; - open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; - - ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence - while (1){ - $_ = $fh->{fh}->getline(); - if ($_) { - last unless ($_ =~ /^\@/); # SAM headers start with @ - } - else{ - last; # no alignment output - } - } - - my $line_1 = $_; - my $line_2 = $fh->{fh}->getline(); - - # if Bowtie produces an alignment we store the first line of the output - if ($line_1 and $line_2) { - chomp $line_1; - chomp $line_2; - my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) - my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line - - ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. - ### We will thus identify which sequence was read 1 and store this ID as last_seq_id - - if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with - $fh->{last_seq_id} = $id_1; - } - elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present - $fh->{last_seq_id} = $id_2; - } - else{ - warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; - } - - $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 - $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 - warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; - } - # otherwise we just initialise last_seq_id and last_lines as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - } - } -} - -### Bowtie 1 (default) | PAIRED-END | FASTQ - -sub paired_end_align_fragments_to_bisulfite_genome_fastQ { - my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - - if ($directional){ - warn "Input file is $C_to_T_infile_1 (FastQ)\n"; - } - elsif($pbat){ - warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"; - } - else{ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n"; - } - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the - ## data structure above - if ($directional or $pbat){ - warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - if ($directional or $pbat){ - unless ($fh->{inputfile_1}){ - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - next; # skipping unwanted filehandles - } - } - - my $bt_options = $bowtie_options; - if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ - $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt_options .= ' --nofw'; - } - - if ($gzip){ - warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n"; - open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!"; - } - else{ - warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n"; - sleep(5); - open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; - } - - my $line_1 = $fh->{fh}->getline(); - my $line_2 = $fh->{fh}->getline(); - - # if Bowtie produces an alignment we store the first line of the output - if ($line_1 and $line_2) { - chomp $line_1; - chomp $line_2; - ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. - ### We will thus identify which sequence was read 1 and store this ID as last_seq_id - - my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) - my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line - - if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present - $fh->{last_seq_id} = $id_1; - } - elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present - $fh->{last_seq_id} = $id_2; - } - else{ - die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; - } - - $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 - $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 - warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; - } - - # otherwise we just initialise last_seq_id and last_lines as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - } - } -} - -### Bowtie 2 | PAIRED-END | FASTQ - -sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { - my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; - if ($directional){ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; - } - else{ - warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; - } - - ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the - ## data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - if ($directional){ - unless ($fh->{inputfile_1}){ - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - next; - } - } - - my $bt2_options = $bowtie_options; - if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ - $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt2_options .= ' --nofw'; - } - - warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; - open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; - - ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence - while (1){ - $_ = $fh->{fh}->getline(); - if ($_) { - last unless ($_ =~ /^\@/); # SAM headers start with @ - } - else{ - last; # no alignment output - } - } - - my $line_1 = $_; - my $line_2 = $fh->{fh}->getline(); - - # if Bowtie produces an alignment we store the first line of the output - if ($line_1 and $line_2) { - chomp $line_1; - chomp $line_2; - ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. - ### We will thus identify which sequence was read 1 and store this ID as last_seq_id - - my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) - my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line - - if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with - $fh->{last_seq_id} = $id_1; - } - elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present - $fh->{last_seq_id} = $id_2; - } - else{ - die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; - } - - $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 - $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 - warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; - } - - # otherwise we just initialise last_seq_id and last_lines as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line_1} = undef; - $fh->{last_line_2} = undef; - } - } -} - -##################################################################################################################################################### - -### Bowtie 1 (default) | SINGLE-END | FASTA -sub single_end_align_fragments_to_bisulfite_genome_fastA { - my ($C_to_T_infile,$G_to_A_infile) = @_; - if ($directional){ - warn "Input file is $C_to_T_infile (FastA)\n"; - } - else{ - warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; - } - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in - ## data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - my $bt_options = $bowtie_options; - if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ - $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt_options .= ' --nofw'; - } - - warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; - if ($gzip){ - open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; - } - else{ - open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data - } - - # if Bowtie produces an alignment we store the first line of the output - $_ = $fh->{fh}->getline(); - if ($_) { - chomp; - my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) - $fh->{last_seq_id} = $id; - $fh->{last_line} = $_; - warn "Found first alignment:\t$fh->{last_line}\n"; - } - # otherwise we just initialise last_seq_id and last_line as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line} = undef; - } - } -} - -### Bowtie 2 | SINGLE-END | FASTA -sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { - my ($C_to_T_infile,$G_to_A_infile) = @_; - if ($directional){ - warn "Input file is $C_to_T_infile (FastA)\n"; - } - else{ - warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; - } - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in - ## data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - - my $bt2_options = $bowtie_options; - if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ - $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt2_options .= ' --nofw'; - } - - warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; - open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; - - ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence - while (1){ - $_ = $fh->{fh}->getline(); - if ($_) { - last unless ($_ =~ /^\@/); # SAM headers start with @ - } - else{ - last; # no alignment output - } - } - - # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output - if ($_) { - chomp; - my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) - $fh->{last_seq_id} = $id; - $fh->{last_line} = $_; - warn "Found first alignment:\t$fh->{last_line}\n"; - } - # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output - else { - warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line} = undef; - } - } -} - - -### Bowtie 1 (default) | SINGLE-END | FASTQ -sub single_end_align_fragments_to_bisulfite_genome_fastQ { - my ($C_to_T_infile,$G_to_A_infile) = @_; - if ($directional){ - warn "Input file is $C_to_T_infile (FastQ)\n"; - } - elsif($pbat){ - warn "Input file is $G_to_A_infile (FastQ)\n"; - } - else{ - warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; - } - - - ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in - ## the data structure above - if ($directional or $pbat){ - warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - - foreach my $fh (@fhs) { - my $bt_options = $bowtie_options; - if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ - $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt_options .= ' --nofw'; - } - - warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; - sleep (5); - - if ($gzip){ - open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; - } - else{ - open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data - } - - # if Bowtie produces an alignment we store the first line of the output - $_ = $fh->{fh}->getline(); - if ($_) { - chomp; - my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) - $fh->{last_seq_id} = $id; - $fh->{last_line} = $_; - warn "Found first alignment:\t$fh->{last_line}\n"; - } - # otherwise we just initialise last_seq_id and last_line as undefined - else { - warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line} = undef; - } - } -} - -### Bowtie 2 | SINGLE-END | FASTQ -sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { - - my ($C_to_T_infile,$G_to_A_infile) = @_; - if ($directional){ - warn "Input file is $C_to_T_infile (FastQ)\n\n"; - } - else{ - warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; - } - - ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in - ## the data structure above - if ($directional){ - warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - else{ - warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; - } - foreach my $fh (@fhs) { - my $bt2_options = $bowtie_options; - if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ - $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner - } - else { - $bt2_options .= ' --nofw'; - } - warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; - warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; - - open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; - ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence - while (1){ - $_ = $fh->{fh}->getline(); - # warn "$_\n"; - # sleep(1); - if ($_) { - last unless ($_ =~ /^\@/); # SAM headers start with @ - } - else { - last; - } - } - - # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output - if ($_) { - chomp; - my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) - $fh->{last_seq_id} = $id; - $fh->{last_line} = $_; - warn "Found first alignment:\t$fh->{last_line}\n"; - # warn "storing $id and\n$_\n"; - } - # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output - else { - warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; - $fh->{last_seq_id} = undef; - $fh->{last_line} = undef; - } - } -} - -########################################################################################################################################### - -sub reset_counters_and_fhs{ - my $filename = shift; - %counting=( - total_meCHH_count => 0, - total_meCHG_count => 0, - total_meCpG_count => 0, - total_meC_unknown_count => 0, - total_unmethylated_CHH_count => 0, - total_unmethylated_CHG_count => 0, - total_unmethylated_CpG_count => 0, - total_unmethylated_C_unknown_count => 0, - sequences_count => 0, - no_single_alignment_found => 0, - unsuitable_sequence_count => 0, - genomic_sequence_could_not_be_extracted_count => 0, - unique_best_alignment_count => 0, - low_complexity_alignments_overruled_count => 0, - CT_CT_count => 0, #(CT read/CT genome, original top strand) - CT_GA_count => 0, #(CT read/GA genome, original bottom strand) - GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) - GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) - CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) - GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) - GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) - CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) - alignments_rejected_count => 0, # only relevant if --directional was specified - ); - - if ($directional){ - if ($filename =~ ','){ # paired-end files - @fhs=( - { name => 'CTreadCTgenome', - strand_identity => 'con ori forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'CTreadGAgenome', - strand_identity => 'con ori reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadCTgenome', - strand_identity => 'compl ori con forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadGAgenome', - strand_identity => 'compl ori con reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - ); - } - else{ # single-end files - @fhs=( - { name => 'CTreadCTgenome', - strand_identity => 'con ori forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'CTreadGAgenome', - strand_identity => 'con ori reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - ); - } - } - elsif($pbat){ - if ($filename =~ ','){ # paired-end files - @fhs=( - { name => 'CTreadCTgenome', - strand_identity => 'con ori forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'CTreadGAgenome', - strand_identity => 'con ori reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadCTgenome', - strand_identity => 'compl ori con forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadGAgenome', - strand_identity => 'compl ori con reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - ); - } - else{ # single-end files - @fhs=( - { name => 'GAreadCTgenome', - strand_identity => 'compl ori con forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadGAgenome', - strand_identity => 'compl ori con reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - ); - } - } - else{ - @fhs=( - { name => 'CTreadCTgenome', - strand_identity => 'con ori forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'CTreadGAgenome', - strand_identity => 'con ori reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadCTgenome', - strand_identity => 'compl ori con forward', - bisulfiteIndex => $CT_index_basename, - seen => 0, - wrong_strand => 0, - }, - { name => 'GAreadGAgenome', - strand_identity => 'compl ori con reverse', - bisulfiteIndex => $GA_index_basename, - seen => 0, - wrong_strand => 0, - }, - ); - } -} - - -sub process_command_line{ - my @bowtie_options; - my $help; - my $mates1; - my $mates2; - my $path_to_bowtie; - my $fastq; - my $fasta; - my $skip; - my $qupto; - my $phred64; - my $phred33; - my $solexa; - my $mismatches; - my $seed_length; - my $best; - my $sequence_format; - my $version; - my $quiet; - my $chunk; - my $non_directional; - my $ceiling; - my $maxins; - my $minins; - my $unmapped; - my $multi_map; - my $output_dir; - my $bowtie2; - my $vanilla; - my $sam_no_hd; - my $seed_extension_fails; - my $reseed_repetitive_seeds; - my $most_valid_alignments; - my $score_min; - my $parallel; - my $temp_dir; - my $rdg; - my $rfg; - my $non_bs_mm; - my $samtools_path; - my $bam; - my $gzip; - my $pbat; - my $prefix; - my $old_flag; - - my $command_line = GetOptions ('help|man' => \$help, - '1=s' => \$mates1, - '2=s' => \$mates2, - 'path_to_bowtie=s' => \$path_to_bowtie, - 'f|fasta' => \$fasta, - 'q|fastq' => \$fastq, - 's|skip=i' => \$skip, - 'u|upto=i' => \$qupto, - 'phred33-quals' => \$phred33, - 'phred64-quals|solexa1' => \$phred64, - 'solexa-quals' => \$solexa, - 'n|seedmms=i' => \$mismatches, - 'l|seedlen=i' => \$seed_length, - 'no_best' => \$best, - 'version' => \$version, - 'quiet' => \$quiet, - 'chunkmbs=i' => \$chunk, - 'non_directional' => \$non_directional, - 'I|minins=i' => \$minins, - 'X|maxins=i' => \$maxins, - 'e|maqerr=i' => \$ceiling, - 'un|unmapped' => \$unmapped, - 'ambiguous' => \$multi_map, - 'o|output_dir=s' => \$output_dir, - 'bowtie2' => \$bowtie2, - 'vanilla' => \$vanilla, - 'sam-no-hd' => \$sam_no_hd, - 'D=i' => \$seed_extension_fails, - 'R=i' => \$reseed_repetitive_seeds, - 'score_min=s' => \$score_min, - 'most_valid_alignments=i' => \$most_valid_alignments, - 'p=i' => \$parallel, - 'temp_dir=s' => \$temp_dir, - 'rdg=s' => \$rdg, - 'rfg=s' => \$rfg, - 'non_bs_mm' => \$non_bs_mm, - 'samtools_path=s' => \$samtools_path, - 'bam' => \$bam, - 'gzip' => \$gzip, - 'pbat' => \$pbat, - 'prefix=s' => \$prefix, - 'old_flag' => \$old_flag, - ); - - - ### EXIT ON ERROR if there were errors with any of the supplied options - unless ($command_line){ - die "Please respecify command line options\n"; - } - ### HELPFILE - if ($help){ - print_helpfile(); - exit; - } - if ($version){ - print << "VERSION"; - - - Bismark - Bisulfite Mapper and Methylation Caller. - - Bismark Version: $bismark_version - Copyright 2010-13 Felix Krueger, Babraham Bioinformatics - www.bioinformatics.babraham.ac.uk/projects/ - - -VERSION - exit; - } - - - ########################## - ### PROCESSING OPTIONS ### - ########################## - - unless ($bowtie2){ - $bowtie2 = 0; - } - unless ($sam_no_hd){ - $sam_no_hd =0; - } - - ### PATH TO BOWTIE - ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH - if ($path_to_bowtie){ - unless ($path_to_bowtie =~ /\/$/){ - $path_to_bowtie =~ s/$/\//; - } - if (-d $path_to_bowtie){ - if ($bowtie2){ - $path_to_bowtie = "${path_to_bowtie}bowtie2"; - } - else{ - $path_to_bowtie = "${path_to_bowtie}bowtie"; - } - } - else{ - die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; - } - } - else{ - if ($bowtie2){ - $path_to_bowtie = 'bowtie2'; - warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } - else{ - $path_to_bowtie = 'bowtie'; - warn "Path to Bowtie specified as: $path_to_bowtie\n"; - } - } - - ### OUTPUT REQUESTED AS BAM FILE - if ($bam){ - if ($vanilla){ - die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; - } - - ### PATH TO SAMTOOLS - if (defined $samtools_path){ - # if Samtools was specified as full command - if ($samtools_path =~ /samtools$/){ - if (-e $samtools_path){ - # Samtools executable found - } - else{ - die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; - } - } - else{ - unless ($samtools_path =~ /\/$/){ - $samtools_path =~ s/$/\//; - } - $samtools_path .= 'samtools'; - if (-e $samtools_path){ - # Samtools executable found - } - else{ - die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; - } - } - - warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; - $bam = 1; - } - # Check whether Samtools is in the PATH if no path was supplied by the user - else{ - if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH - $samtools_path = `which samtools`; - chomp $samtools_path; - warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; - $bam = 1; - } - } - - unless (defined $samtools_path){ - $bam = 2; - warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; - } - sleep (1); - } - - - #################################### - ### PROCESSING ARGUMENTS - - ### GENOME FOLDER - my $genome_folder = shift @ARGV; # mandatory - unless ($genome_folder){ - warn "Genome folder was not specified!\n"; - print_helpfile(); - exit; - } - - ### checking that the genome folder, all subfolders and the required bowtie index files exist - unless ($genome_folder =~/\/$/){ - $genome_folder =~ s/$/\//; - } - - if (chdir $genome_folder){ - my $absolute_genome_folder = getcwd; ## making the genome folder path absolute - unless ($absolute_genome_folder =~/\/$/){ - $absolute_genome_folder =~ s/$/\//; - } - warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; - $genome_folder = $absolute_genome_folder; - } - else{ - die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] {-1 -2 | } [] (--help for more details)\n"; - } - - my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; - my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; - - if ($bowtie2){ ### Bowtie 2 (new) - ### checking the integrity of $CT_dir - chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; - my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); - foreach my $file(@CT_bowtie_index){ - unless (-f $file){ - die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n"; - } - } - ### checking the integrity of $GA_dir - chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; - my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); - foreach my $file(@GA_bowtie_index){ - unless (-f $file){ - die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n"; - } - } - } - - else{ ### Bowtie 1 (default) - ### checking the integrity of $CT_dir - chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; - my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); - foreach my $file(@CT_bowtie_index){ - unless (-f $file){ - die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; - } - } - ### checking the integrity of $GA_dir - chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; - my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); - foreach my $file(@GA_bowtie_index){ - unless (-f $file){ - die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; - } - } - } - - my $CT_index_basename = "${CT_dir}BS_CT"; - my $GA_index_basename = "${GA_dir}BS_GA"; - - ### INPUT OPTIONS - - ### SEQUENCE FILE FORMAT - ### exits if both fastA and FastQ were specified - if ($fasta and $fastq){ - die "Only one sequence filetype can be specified (fastA or fastQ)\n"; - } - - ### unless fastA is specified explicitely, fastQ sequence format is expected by default - if ($fasta){ - print "FastA format specified\n"; - $sequence_format = 'FASTA'; - push @bowtie_options, '-f'; - } - elsif ($fastq){ - print "FastQ format specified\n"; - $sequence_format = 'FASTQ'; - push @bowtie_options, '-q'; - } - else{ - $fastq = 1; - print "FastQ format assumed (by default)\n"; - $sequence_format = 'FASTQ'; - push @bowtie_options, '-q'; - } - - ### SKIP - if ($skip){ - warn "Skipping the first $skip reads from the input file\n"; - # push @bowtie_options,"-s $skip"; - } - - ### UPTO - if ($qupto){ - warn "Processing sequences up to read no. $qupto from the input file\n"; - if ($bowtie2){ - # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 - } - else{ - # push @bowtie_options,"--qupto $qupto"; - } - } - - ### QUALITY VALUES - if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ - die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; - } - if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. - # Phred quality values work only when -q is specified - unless ($fastq){ - die "Phred quality values works only when -q (FASTQ) is specified\n"; - } - if ($bowtie2){ - push @bowtie_options,"--phred33"; - } - else{ - push @bowtie_options,"--phred33-quals"; - } - } - if ($phred64){ - # Phred quality values work only when -q is specified - unless ($fastq){ - die "Phred quality values work only when -q (FASTQ) is specified\n"; - } - if ($bowtie2){ - push @bowtie_options,"--phred64"; - } - else{ - push @bowtie_options,"--phred64-quals"; - } - } - else{ - $phred64 = 0; - } - - if ($solexa){ - if ($bowtie2){ - die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; - } - # Solexa to Phred value conversion works only when -q is specified - unless ($fastq){ - die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; - } - push @bowtie_options,"--solexa-quals"; - } - else{ - $solexa = 0; - } - - ### ALIGNMENT OPTIONS - - ### MISMATCHES - if (defined $mismatches){ - if ($bowtie2){ - if ($mismatches == 0 or $mismatches == 1){ - push @bowtie_options,"-N $mismatches"; - } - else{ - die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n"; - } - } - else{ - if ($mismatches >= 0 and $mismatches <= 3){ - push @bowtie_options,"-n $mismatches"; - } - else{ - die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n"; - } - } - } - else{ - unless ($bowtie2){ - push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 - } - } - - ### SEED LENGTH - if (defined $seed_length){ - if ($bowtie2){ - push @bowtie_options,"-L $seed_length"; - } - else{ - push @bowtie_options,"-l $seed_length"; - } - } - - ### MISMATCH CEILING - if (defined $ceiling){ - die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); - push @bowtie_options,"-e $ceiling"; - } - - - ### BOWTIE 2 EFFORT OPTIONS - - ### CONSECUTIVE SEED EXTENSION FAILS - if (defined $seed_extension_fails){ - die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2); - push @bowtie_options,"-D $seed_extension_fails"; - } - - ### RE-SEEDING REPETITIVE SEEDS - if (defined $reseed_repetitive_seeds){ - die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2); - push @bowtie_options,"-R $reseed_repetitive_seeds"; - } - - - ### BOWTIE 2 SCORING OPTIONS - if ($score_min){ - die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2); - unless ($score_min =~ /^L,.+,.+$/){ - die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; - } - push @bowtie_options,"--score-min $score_min"; - } - else{ - if ($bowtie2){ - push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2 - } - } - - ### BOWTIE 2 READ GAP OPTIONS - my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); - - if ($rdg){ - die "The option '--rdg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); - if ($rdg =~ /^(\d+),(\d+)$/){ - $deletion_open = $1; - $deletion_extend = $2; - } - else{ - die "The option '--rdg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; - } - push @bowtie_options,"--rdg $rdg"; - } - else{ - $deletion_open = 5; - $deletion_extend = 3; - } - - ### BOWTIE 2 REFERENCE GAP OPTIONS - if ($rfg){ - die "The option '--rfg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); - if ($rfg =~ /^(\d+),(\d+)$/){ - $insertion_open = $1; - $insertion_extend = $2; - } - else{ - die "The option '--rfg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; - } - push @bowtie_options,"--rfg $rfg"; - } - else{ - $insertion_open = 5; - $insertion_extend = 3; - } - - - ### BOWTIE 2 PARALLELIZATION OPTIONS - if (defined $parallel){ - die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); - } - if ($bowtie2){ - if ($parallel){ - die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); - push @bowtie_options,"-p $parallel"; - push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. - print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; - sleep (2); - } - } - - ### REPORTING OPTIONS - - if ($bowtie2){ - push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default - - ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while - if(defined $most_valid_alignments){ - - warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; - # push @bowtie_options,"-M $most_valid_alignments";sleep (5); - } - # else{ - # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence - # } - } - else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 - push @bowtie_options,'-k 2'; - } - - ### --BEST - if ($bowtie2){ - if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used - die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; - } - } - else{ - # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) - unless ($best){ - push @bowtie_options,'--best'; - } - } - - ### VANILLA BISMARK (BOWTIE 1) OUTPUT - if ($vanilla){ - if ($bowtie2){ - die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; - } - } - else{ - $vanilla = 0; - } - - ### PAIRED-END MAPPING - if ($mates1){ - my @mates1 = (split (/,/,$mates1)); - die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2); - my @mates2 = (split(/,/,$mates2)); - unless (scalar @mates1 == scalar @mates2){ - die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n"; - } - while (1){ - my $mate1 = shift @mates1; - my $mate2 = shift @mates2; - last unless ($mate1 and $mate2); - push @filenames,"$mate1,$mate2"; - } - if ($bowtie2){ - push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments - push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones - } - - if ($old_flag){ - warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); - sleep(3); - } - } - elsif ($mates2){ - die "Paired-end mapping requires the format: -1 -2 , please respecify!\n"; - } - - ### SINGLE-END MAPPING - # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified - my $singles; - unless ($mates1 and $mates2){ - $singles = join (',',@ARGV); - unless ($singles){ - die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; - } - $singles =~ s/\s/,/g; - @filenames = (split(/,/,$singles)); - warn "\nFiles to be analysed:\n"; - warn "@filenames\n\n"; - sleep (3); - } - - ### MININUM INSERT SIZE (PAIRED-END ONLY) - if (defined $minins){ - die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); - push @bowtie_options,"--minins $minins"; - } - - ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) - if (defined $maxins){ - die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); - push @bowtie_options,"--maxins $maxins"; - } - else{ - unless ($singles){ - push @bowtie_options,'--maxins 500'; - } - } - - ### QUIET prints nothing besides alignments (suppresses warnings) - if ($quiet){ - push @bowtie_options,'--quiet'; - } - - ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments - unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option - if (defined $chunk){ - push @bowtie_options,"--chunkmbs $chunk"; - } - else{ - push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) - } - } - - - ### SUMMARY OF ALL BOWTIE OPTIONS - my $bowtie_options = join (' ',@bowtie_options); - - - ### STRAND-SPECIFIC LIBRARIES - my $directional; - if ($non_directional){ - die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); - warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; - sleep (3); - $directional = 0; - } - elsif($pbat){ - die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); - die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2); - die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); - - warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; - sleep (3); - $directional = 0; - } - else{ - warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; - sleep (3); - $directional = 1; # default behaviour - } - - ### UNMAPPED SEQUENCE OUTPUT - $unmapped = 0 unless ($unmapped); - - ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT - $multi_map = 0 unless ($multi_map); - - - ### OUTPUT DIRECTORY - - chdir $parent_dir or die "Failed to move back to current working directory\n"; - if ($output_dir){ - unless ($output_dir =~ /\/$/){ - $output_dir =~ s/$/\//; - } - - if (chdir $output_dir){ - $output_dir = getcwd; # making the path absolute - unless ($output_dir =~ /\/$/){ - $output_dir =~ s/$/\//; - } - } - else{ - mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; - warn "Created output directory $output_dir!\n\n"; - chdir $output_dir or die "Failed to move to $output_dir\n"; - $output_dir = getcwd; # making the path absolute - unless ($output_dir =~ /\/$/){ - $output_dir =~ s/$/\//; - } - } - warn "Output will be written into the directory: $output_dir\n"; - } - else{ - $output_dir = ''; - } - - ### TEMPORARY DIRECTORY for C->T and G->A transcribed files - - chdir $parent_dir or die "Failed to move back to current working directory\n"; - if ($temp_dir){ - warn "\nUsing temp directory: $temp_dir\n"; - unless ($temp_dir =~ /\/$/){ - $temp_dir =~ s/$/\//; - } - - if (chdir $temp_dir){ - $temp_dir = getcwd; # making the path absolute - unless ($temp_dir =~ /\/$/){ - $temp_dir =~ s/$/\//; - } - } - else{ - mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; - warn "Created temporary directory $temp_dir!\n\n"; - chdir $temp_dir or die "Failed to move to $temp_dir\n"; - $temp_dir = getcwd; # making the path absolute - unless ($temp_dir =~ /\/$/){ - $temp_dir =~ s/$/\//; - } - } - warn "Temporary files will be written into the directory: $temp_dir\n"; - } - else{ - $temp_dir = ''; - } - - ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE - if ($non_bs_mm){ - if ($vanilla){ - die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; - } - } - - ### PREFIX FOR OUTPUT FILES - if ($prefix){ - # removing trailing dots - - $prefix =~ s/\.+$//; - - warn "Using the following prefix for output files: $prefix\n\n"; - sleep(1); - } - - - return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag); -} - - - -sub generate_SAM_header{ - print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order - foreach my $chr (keys %chromosomes){ - my $length = length ($chromosomes{$chr}); - print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length - } - print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version -} - -### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: -### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011) - -sub single_end_SAM_output{ - my ($id,$actual_seq,$methylation_call_params,$qual) = @_; - my $strand = $methylation_call_params->{$id}->{alignment_strand}; - my $chr = $methylation_call_params->{$id}->{chromosome}; - my $start = $methylation_call_params->{$id}->{position}; - my $stop = $methylation_call_params->{$id}->{end_position}; - my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; - my $methcall = $methylation_call_params->{$id}->{methylation_call}; - my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; - my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; - my $number_of_mismatches; - if ($bowtie2){ - $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; - } - else{ - $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; - } - - ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" - ## FLAG: bitwise FLAG. Each bit is explained in the following table: - ## Bit Description Comment Value - ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) - ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) - ## 0x4 segment unmapped --- --- - ## 0x8 next segment in the template unmapped --- --- - ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) - ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) - ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) - ## 0x80 the last segment in the template read 2 value: 2**7 (128) - ## 0x100 secondary alignment --- --- - ## 0x200 not passing quality controls --- --- - ## 0x400 PCR or optical duplicate --- --- - - ##### - - my $flag; # FLAG variable used for SAM format. - if ($strand eq "+"){ - if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ - $flag = 0; # 0 for "+" strand (OT) - } - elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ - $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) - } - else{ - die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; - } - } - elsif ($strand eq "-"){ - if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ - $flag = 16; # 16 for "-" strand (OB) - } - elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ - $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) - } - else{ - die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; - } - } - else{ - die "Unexpected strand information: $strand\n\n"; - } - - ##### - - my $mapq = 255; # Assume mapping quality is unavailable - - ##### - - my $cigar; - if ($bowtie2){ - $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 - } - else{ - $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) - } - - ##### - - my $rnext = "*"; # Paired-end variable - - ##### - - my $pnext = 0; # Paired-end variable - - ##### - - my $tlen = 0; # Paired-end variable - - ##### - - if ($read_conversion eq 'CT'){ - $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands - } - else{ - $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries - } - - if ($strand eq '-'){ - $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand - $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence - $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well - } - - ##### - - my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string - # into the reference string. hemming_dist() - if ($bowtie2){ - $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence - } - - my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences - - ##### - - my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) - - ##### - - my $XM_tag; # Optional tag XM: Methylation Call String - if ($strand eq '+'){ - $XM_tag = "XM:Z:$methcall"; - } - elsif ($strand eq '-'){ - $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well - } - - ##### - - my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion - - ##### - - my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion - - ##### - - # Optionally calculating number of mismatches for Bowtie 2 alignments - - if ($non_bs_mm) { - if ($bowtie2) { - - $number_of_mismatches =~ s/-//; # removing the minus sign - - ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches - if ($cigar =~ /(D|I)/) { - # warn "$cigar\n"; - - # parsing CIGAR string - my @len = split (/\D+/,$cigar); # storing the length per operation - my @ops = split (/\d+/,$cigar); # storing the operation - shift @ops; # remove the empty first element - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - foreach (0..$#len) { - if ($ops[$_] eq 'M') { - # warn "skipping\n"; - next; # irrelevant - } - elsif ($ops[$_] eq 'I') { # insertion in the read sequence - $number_of_mismatches -= $insertion_open; - $number_of_mismatches -= $len[$_] * $insertion_extend; - # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; - } - elsif ($ops[$_] eq 'D') { # deletion in the read sequence - $number_of_mismatches -= $deletion_open; - $number_of_mismatches -= $len[$_] * $deletion_extend; - # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; - } - elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die - die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; - } - else { - die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; - } - } - # warn "Alignment score $number_of_mismatches\n"; - # print "Mismatches $number_of_mismatches\n\n"; - } - ### Now we have InDel corrected alignment scores - - ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the - ### sequence contained more than 5 Ns, but this should occur close to never - - my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division - # warn "N count: $seq_N_count\n"; - $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; - # warn "MM $number_of_mismatches\n"; - } - } - - #### - - my $XA_tag = "XA:Z:$number_of_mismatches"; - - ##### - - # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields - ### optionally print number of non-bisulfite mismatches - if ($non_bs_mm){ - print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; - } - else{ # default - # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields - print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; - } -} - -sub paired_end_SAM_output{ - my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; - my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand - my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; - my $chr = $methylation_call_params->{$id}->{chromosome}; - my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; - my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; - my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; - my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; - my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; - my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; - my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; - - my $id_1; - my $id_2; - - if ($old_flag){ - $id_1 = $id.'/1'; - $id_2 = $id.'/2'; - } - else{ - $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard - $id_2 = $id; - } - - # Allows all degenerate nucleotide sequences in reference genome - die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; - die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; - - my $index; # used to store the srand origin of the alignment in a less convoluted way - - if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ - $index = 0; ## this is OT (original top strand) - } - elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ - $index = 1; ## this is CTOB (complementary to OB) - } - elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ - $index = 2; ## this is CTOT (complementary to OT) - } - elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ - $index = 3; ## this is OB (original bottom) - } - else { - die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; - } - - my $number_of_mismatches_1; - my $number_of_mismatches_2; - - if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine - $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! - $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; - } - else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation - if ($index == 2 or $index == 3){ # CTOT or OB - $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! - $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; - } - else{ # if the first read aligned in forward direction it is like for Bowtie 2 - $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! - $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; - } - } - - - - ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the - ### first or last position. - - if ($index == 0 or $index == 3){ # OT or OB - $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); - $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); - } - else{ # CTOT or CTOB - $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); - $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); - } - - ##### - - my $start_read_1; - my $start_read_2; - # adjusting end positions - - if ($bowtie2){ - $start_read_1 = $methylation_call_params->{$id}->{position_1}; - $start_read_2 = $methylation_call_params->{$id}->{position_2}; - } - else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 - if ($strand_1 eq '+'){ # Read 1 aligns to the + strand - $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; - $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; - } - else{ # read 1 is on the - strand - $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; - $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; - } - } - - ##### - - my $end_read_1; - my $end_read_2; - # adjusting end positions - - if ($bowtie2){ - $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; - $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; - } - else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 - if ($strand_1 eq '+'){ # Read 1 aligns to the + strand - $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; - $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; - } - else{ - $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; - $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; - } - } - - ##### - - ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" - ## FLAG: bitwise FLAG. Each bit is explained in the following table: - ## Bit Description Comment Value - ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) - ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) - ## 0x4 segment unmapped --- --- - ## 0x8 next segment in the template unmapped --- --- - ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) - ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) - ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) - ## 0x80 the last segment in the template read 2 value: 2^^7 (128) - ## 0x100 secondary alignment --- --- - ## 0x200 not passing quality controls --- --- - ## 0x400 PCR or optical duplicate --- --- - - ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account - - # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) - # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) - - my $flag_1; # FLAG variable used for SAM format - my $flag_2; - - ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH) - - if ($index == 0){ # OT - unless ($old_flag){ - $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) - $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) - } - else{ - $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) - $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) - } - } - elsif ($index == 1){ # CTOB - unless($old_flag){ - $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) - $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) - } - else{ - $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) - $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) - } - } - elsif ($index == 2){ # CTOT - unless ($old_flag){ - $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) - $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) - } - else{ - $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) - $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) - } - } - elsif ($index == 3){ # OB - unless ($old_flag){ - $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) - $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) - } - else{ - $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) - $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) - } - } - - ##### - - my $mapq = 255; # Mapping quality is unavailable - - ##### - - my $cigar_1; - my $cigar_2; - - if ($bowtie2){ - $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 - $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; - } - else{ - $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) - $cigar_2 = length($actual_seq_2) . "M"; - } - - ##### - - my $rnext = '='; # Chromosome of mate; applies to both reads - - ##### - - my $pnext_1 = $start_read_2; # Leftmost position of mate - my $pnext_2 = $start_read_1; - - ##### - - my $tlen_1; # signed observed Template LENgth (or inferred fragment size) - my $tlen_2; - - if ($bowtie2){ - - if ($start_read_1 <= $start_read_2){ - - # Read 1 alignment is leftmost - - if ($end_read_2 >= $end_read_1){ - - # -------------------------> read 1 reads overlapping - # <------------------------- read 2 - # - # or - # - # -------------------------> read 1 - # <----------------------- read 2 read 2 contained within read 1 - # - # or - # - # -------------------------> read 1 reads 1 and 2 exactly overlapping - # <------------------------- read 2 - # - - # dovetailing of reads is not enabled for Bowtie 2 alignments - - $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, - $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign - } - elsif ($end_read_2 < $end_read_1){ - - # -------------------------> read 1 - # <----------- read 2 read 2 contained within read 1 - # - # or - # - # -------------------------> read 1 - # <------------------------ read 2 read 2 contained within read 1 - - # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable - $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, - $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this - ### as a request by frozenlyse on SeqAnswers on 24 July 2013 - } - - } - - elsif ($start_read_2 < $start_read_1){ - - if ($end_read_1 >= $end_read_2){ - - # Read 2 alignment is leftmost - - # -------------------------> read 2 reads overlapping - # <------------------------- read 1 - # - # or - # - # -------------------------> read 2 - # <----------------------- read 1 read 1 contained within read 2 - # - # - - $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, - $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign - } - elsif ($end_read_1 < $end_read_2){ - - # -------------------------> read 2 - # <----------- read 1 read 1 contained within read 2 - # - # or - # - # -------------------------> read 2 - # <------------------------ read 1 read 1 contained within read 2 - - # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable - $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, - $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this - ### as a request by frozenlyse on SeqAnswers on 24 July 2013 - } - } - } - - else{ # Bowtie 1 - - if ($end_read_2 >= $end_read_1){ - # Read 1 alignment is leftmost - # -------------------------> read 1 - # <------------------------- read 2 - # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing - - $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, - $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign - } - else{ - # Read 2 alignment is leftmost - # -------------------------> read 2 - # <------------------------- read 1 - # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing - - $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, - $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign - } - } - - ##### - - # adjusting the strand of the sequence before we use them to generate mismatch strings - if ($strand_1 eq '-'){ - $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand - $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence - $qual_1 = reverse $qual_1; # we need to reverse the quality string as well - } - if ($strand_2 eq '-'){ - $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand - $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence - $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well - } - - # print "$actual_seq_1\n$ref_seq_1\n\n"; - # print "$actual_seq_2\n$ref_seq_2\n\n"; - - ##### - - my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence - my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); - if ($bowtie2){ - $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence - $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence - } - my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences - my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences - - ##### - - my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) - my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); - - ##### - - my $XM_tag_1; # Optional tag XM: Methylation call string - my $XM_tag_2; - - if ($strand_1 eq '-'){ - $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented - } - else{ - $XM_tag_1 = "XM:Z:$methcall_1"; - } - - if ($strand_2 eq '-'){ - $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented - } - else{ - $XM_tag_2 = "XM:Z:$methcall_2"; - } - - ##### - - my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state - my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state - - ##### - - my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads - - ##### - - # Optionally calculating number of mismatches for Bowtie 2 alignments - - if ($non_bs_mm) { - if ($bowtie2) { - - $number_of_mismatches_1 =~ s/-//; # removing the minus sign - $number_of_mismatches_2 =~ s/-//; - - ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches - - ### CIGAR 1 - if ($cigar_1 =~ /(D|I)/) { - # warn "$cigar_1\n"; - - # parsing CIGAR string - my @len = split (/\D+/,$cigar_1); # storing the length per operation - my @ops = split (/\d+/,$cigar_1); # storing the operation - shift @ops; # remove the empty first element - die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - foreach (0..$#len) { - if ($ops[$_] eq 'M') { - # warn "skipping\n"; - next; # irrelevant - } - elsif ($ops[$_] eq 'I') { # insertion in the read sequence - $number_of_mismatches_1 -= $insertion_open; - $number_of_mismatches_1 -= $len[$_] * $insertion_extend; - # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; - } - elsif ($ops[$_] eq 'D') { # deletion in the read sequence - $number_of_mismatches_1 -= $deletion_open; - $number_of_mismatches_1 -= $len[$_] * $deletion_extend; - # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; - } - elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die - die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; - } - else { - die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; - } - } - - # warn "Alignment score $number_of_mismatches_1\n"; - # print "Mismatches $number_of_mismatches_1\n\n"; - } - - ### CIGAR 2 - if ($cigar_2 =~ /(D|I)/) { - # warn "$cigar_2\n"; - - # parsing CIGAR string - my @len = split (/\D+/,$cigar_2); # storing the length per operation - my @ops = split (/\d+/,$cigar_2); # storing the operation - shift @ops; # remove the empty first element - die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - foreach (0..$#len) { - if ($ops[$_] eq 'M') { - # warn "skipping\n"; - next; #irrelevant - } - elsif ($ops[$_] eq 'I') { # insertion in the read sequence - $number_of_mismatches_2 -= $insertion_open; - $number_of_mismatches_2 -= $len[$_] * $insertion_extend; - # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; - } - elsif ($ops[$_] eq 'D') { # deletion in the read sequence - $number_of_mismatches_2 -= $deletion_open; - $number_of_mismatches_2 -= $len[$_] * $deletion_extend; - # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; - } - elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die - die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; - } - else { - die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; - } - } - } - - ### Now we have InDel corrected Alignment scores - - ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the - ### sequence contained more than 5 Ns, but this should occur close to never - - my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division - my $seq_2_N_count = $number_of_mismatches_2 % 6; - # warn "N count 1: $seq_1_N_count\n"; - # warn "N count 2: $seq_2_N_count\n"; - - $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; - $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; - - # warn "MM1 $number_of_mismatches_1 \n"; - # warn "MM2 $number_of_mismatches_2 \n"; - } - } - - #### - - my $XA_tag = "XA:Z:$number_of_mismatches_1"; - my $XB_tag = "XB:Z:$number_of_mismatches_2"; - - - # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields - ### optionally print number of non-bisulfite mismatches - if ($non_bs_mm){ - print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; - print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; - } - else{ # default - print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; - print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; - } -} - -sub revcomp{ - my $seq = shift or die "Missing seq to reverse complement\n"; - $seq = reverse $seq; - $seq =~ tr/ACTGactg/TGACTGAC/; - return $seq; -} - -sub hemming_dist{ - my $matches = 0; - my @actual_seq = split //,(shift @_); - my @ref_seq = split //,(shift @_); - foreach (0..$#actual_seq){ - ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); - } - return my $hd = scalar @actual_seq - $matches; -} - -sub make_mismatch_string{ - my $actual_seq = shift or die "Missing actual sequence"; - my $ref_seq = shift or die "Missing reference sequence"; - my $XX_tag = "XX:Z:"; - my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison - my $prev_mm_pos = 0; - while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference - my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch - my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read - $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other - $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) - $prev_mm_pos = pos($tmp); # Position of last mismatch - } - my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence - $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence - return $XX_tag; -} - - - -sub print_helpfile{ - print << "HOW_TO"; - - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . - - - -DESCRIPTION - - -The following is a brief description of command line options and arguments to control the Bismark -bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the -reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand -version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). -Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome -(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the -forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) -are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original -sequence from the genome and determine if there were any protected C's present or not. - -As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be -re-enabled by using --non_directional. - -The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old -'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best -alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. - - -USAGE: bismark [options] {-1 -2 | } - - -ARGUMENTS: - - The path to the folder containing the unmodified reference genome - as well as the subfolders created by the Bismark_Genome_Preparation - script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). - Bismark expects one or more fastA files in this folder (file extension: .fa - or .fasta). The path can be relative or absolute. - --1 Comma-separated list of files containing the #1 mates (filename usually includes - "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in . - Reads may be a mix of different lengths. Bismark will produce one mapping result - and one report file per paired-end input file pair. - --2 Comma-separated list of files containing the #2 mates (filename usually includes - "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in . - Reads may be a mix of different lengths. - - A comma- or space-separated list of files containing the reads to be aligned (e.g. - lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will - produce one mapping result and one report file per input file. - - -OPTIONS: - - -Input: - --q/--fastq The query input files (specified as , or are FASTQ - files (usually having extension .fg or .fastq). This is the default. See also - --solexa-quals. - --f/--fasta The query input files (specified as , or are FASTA - files (usually havin extension .fa, .mfa, .fna or similar). All quality values - are assumed to be 40 on the Phred scale. FASTA files are expected to contain both - the read name and the sequence on a single line (and not spread over several lines). - --s/--skip Skip (i.e. do not align) the first reads or read pairs from the input. - --u/--upto Only aligns the first reads or read pairs from the input. Default: no limit. - ---phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. - ---phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. - ---solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled - (which can't). The formula for conversion is: - phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This - is usually the right option for use with (unconverted) reads emitted by the GA - Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. - ---solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) - reads emitted by GA Pipeline version 1.3 or later. Default: off. - ---path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not - specified it is assumed that Bowtie (1 or 2) is in the PATH. - - -Alignment: - --n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs - of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the - default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). - --l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to - which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for - larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). - --e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout - the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds - quality values to the nearest 10 and saturates at 30. This value is not relevant for - Bowtie 2. - ---chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in - --best mode. Best-first search must keep track of many paths at once to ensure it is - always extending the path with the lowest cumulative cost. Bowtie tries to minimize the - memory impact of the descriptors, but they can still grow very large in some cases. If - you receive an error message saying that chunk memory has been exhausted in --best mode, - try adjusting this parameter up to dedicate more memory to the descriptors. This value - is not relevant for Bowtie 2. Default: 512. - --I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and - a paired-end alignment consists of two 20-bp alignments in the appropriate orientation - with a 20-bp gap between them, that alignment is considered valid (as long as -X is also - satisfied). A 19-bp gap would not be valid in that case. Default: 0. - --X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and - a paired-end alignment consists of two 20-bp alignments in the proper orientation with a - 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). - A 61-bp gap would not be valid in that case. Default: 500. - - -Bowtie 1 Reporting: - --k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option - will be used by default. - ---best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum - (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in - terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred - quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both - have Phred quality 10. When --best is not specified, Bowtie may report alignments that - are sub-optimal in terms of stratum and/or quality (though an effort is made to report - the best alignment). --best mode also removes all strand bias. Note that --best does not - affect which alignments are considered "valid" by Bowtie, only which valid alignments - are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. - Default: on. - ---no_best Disables the --best option which is on by default. This can speed up the alignment process, - e.g. for testing purposes, but for credible results it is not recommended to disable --best. - - -Output: - ---non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four - bisulfite strands will be reported. Default: OFF. - - (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary - to the original strands are merely theoretical and should not exist in reality. Specifying directional - alignments (which is the default) will only run 2 alignment threads to the original top (OT) - or bottom (OB) strands in parallel and report these alignments. This is the recommended option - for sprand-specific libraries). - ---pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., - PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, - as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT - and OB ones. Use this option only if you are certain that your libraries were constructed following - a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option - --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed - temporary files only). - ---sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are - split up into several smaller files to run concurrently and the output files are to be merged. - ---quiet Print nothing besides alignments. - ---vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead - of SAM format output. - --un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will - appear as they did in the input, without any translation of quality values that may have - taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 - and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads - with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) - are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. - ---ambiguous Write all reads which produce more than one valid alignment with the same number of lowest - mismatches or other reads that fail to align uniquely to a file in the output directory. - Written reads will appear as they did in the input, without any of the translation of quality - values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two - parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and - _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. - --o/--output_dir Write all output files into this directory. By default the output files will be written into - the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt - to create it first. The path to the output folder can be either relative or absolute. - ---temp_dir Write temporary files to this directory instead of into the same directory as the input files. If - the specified folder does not exist, Bismark will attempt to create it first. The path to the - temporary folder can be either relative or absolute. - ---non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the - alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is - just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. - The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' - and 'XB:Z:number of mismatches' for read 2 of paired-end reads. - ---gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk - space. This option is available for most alignment modes but is not available for paired-end FastA - files. This option might be somewhat slower than writing out uncompressed files, but this awaits - further testing. - ---bam The output will be written out in BAM format instead of the default SAM format. Bismark will - attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't - been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, - the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). - ---samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified - explicitly if Samtools is in the PATH already. - ---prefix Prefixes to the output filenames. Trailing dots will be replaced by a single one. For - example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. - ---old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, - this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since - both the appended read IDs and custom FLAG values may cause problems with some downstream tools - such as Picard, new defaults were implemented as of version 0.8.3. - - - default old_flag - =================== =================== - Read 1 Read 2 Read 1 Read 2 - - OT: 99 147 67 131 - - OB: 83 163 115 179 - - CTOT: 99 147 67 131 - - CTOB: 83 163 115 179 - - - -Other: - --h/--help Displays this help file. - --v/--version Displays version information. - - -BOWTIE 2 SPECIFIC OPTIONS - ---bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end - alignments, i.e. searches for alignments involving all read characters (also called - untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter - and/or quality trimmed where appropriate. Default: off. - -Bowtie 2 alignment options: - --N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. - Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) - but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for - Bowtie 1 see -n). - --L Sets the length of the seed substrings to align during multiseed alignment. Smaller values - make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is - used by default, which sets -L to 20. This option is only available for Bowtie 2 (for - Bowtie 1 see -l). - ---ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched - position to be the highest possible, regardless of the actual value. I.e. input is treated - as though all quality values are high. This is also the default behavior when the input - doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. - - -Bowtie 2 paired-end options: - ---no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if - it cannot find a concordant or discordant alignment for a pair. This option is invariable and - and on by default. - ---no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. - A discordant alignment is an alignment where both mates align uniquely, but that does not - satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior - and it is on by default. - - -Bowtie 2 effort options: - --D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using - the alignments found so far. A seed extension "fails" if it does not yield a new best or a - new second-best alignment. Default: 15. - --R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. - When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of - mismatches allowed) at different offsets and searches for more alignments. A read is considered - to have repetitive seeds if the total number of seed hits divided by the number of seeds - that aligned at least once is greater than 300. Default: 2. - -Bowtie 2 parallelization options: - - --p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores - and synchronize when parsing reads and outputting alignments. Searching for alignments is highly - parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. - E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint - by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads - library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will - automatically use the option '--reorder', which guarantees that output SAM records are printed in - an order corresponding to the order of the reads in the original input file, even when -p is set - greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and - setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then - if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally - correspond to input order in that case. - -Bowtie 2 Scoring options: - ---score_min Sets a function governing the minimum alignment score needed for an alignment to be considered - "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying - L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. - See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is - L,0,-0.2. - ---rdg , Sets the read gap open () and extend () penalties. A read gap of length N gets a penalty - of + N * . Default: 5, 3. - ---rfg , Sets the reference gap open () and extend () penalties. A reference gap of length N gets - a penalty of + N * . Default: 5, 3. - - -Bowtie 2 Reporting options: - --most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is - deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the - default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the - effort expended to find valid alignments. - - For reference, this used to be the old (now deprecated) description of -M: - Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it - can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever - happens first. Only the best alignment is reported. Information from the other alignments is used to - estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes - Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that - aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not - guarantee that the alignment reported is the best possible in terms of alignment score. -M is - always used and its default value is set to 10. - - -'VANILLA' Bismark OUTPUT: - -Single-end output format (tab-separated): - - (1) - (2) - (3) - (4) - (5) - (6) - (7) - (8) - (9) -(11) - - -Paired-end output format (tab-separated): - (1) - (2) - (3) - (4) - (5) - (6) - (7) - (8) - (9) -(10) -(11) -(12) -(14) -(15) - - -Bismark SAM OUTPUT (default): - - (1) QNAME (seq-ID) - (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) - (3) RNAME (chromosome) - (4) POS (start position) - (5) MAPQ (always 255) - (6) CIGAR - (7) RNEXT - (8) PNEXT - (9) TLEN -(10) SEQ -(11) QUAL (Phred33 scale) -(12) NM-tag (edit distance to the reference) -(13) XX-tag (base-by-base mismatches to the reference. This does not include indels) -(14) XM-tag (methylation call string) -(15) XR-tag (read conversion state for the alignment) -(16) XG-tag (genome conversion state for the alignment) -(17) XA/XB-tag (non-bisulfite mismatches) (optional!) - -Each read of paired-end alignments is written out in a separate line in the above format. - - -Last edited on 07 October 2013. - -HOW_TO -} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_bowtie2_wrapper.xml --- a/bismark_bowtie2_wrapper.xml Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,666 +0,0 @@ - - - bisulfite mapper (bowtie2) - - - SCRIPT_PATH - samtools - bowtie2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['isReportOutput'] is True - )) - - - - - (( - params['settingsType'] == "custom" and - params['bismark_stdout'] is True - )) - - - - - - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['suppressed_read_file'] is True - )) - - - - - - - - - - - - - - - - - (( - singlePaired['sPaired'] == "paired" and - params['settingsType'] == "custom" and - params['suppressed_read_file'] is True - )) - - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['unmapped_read_file'] is True - )) - - - - - - - - - - - - - - - - - (( - singlePaired['sPaired'] == "paired" and - params['settingsType'] == "custom" and - params['unmapped_read_file'] is True - )) - - - - - - - - - - - - - - - - - - - -T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). -Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome -(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the -forward strand, by doing this alignments will produce the same positions). These instances of Bowtie 2 -are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original -sequence from the genome and determine if there were any protected C's present or not. - -.. _Bismark: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - -As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be -re-enabled by using non_directional mode. - -It is developed by Krueger F and Andrews SR. at the Babraham Institute. Krueger F, Andrews SR. (2011) Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics, 27, 1571-2. - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - - .. __: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - - -.. class:: warningmark - -Make sure all your input reads are in the correct and same format. If thats not the case please adjust/convert the filetype with galaxy's build-in converters. - ------- - -**Input formats** - -Bismark accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*), Illumina FASTQ format (galaxy type *fastqillumina*) or FASTA format (galaxy type *fasta*). Use the FASTQ Groomer to prepare your files. - ------- - -**A Note on Built-in Reference Genomes** - -The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY. - ------- - -The final output of Bismark is in SAM format by default. - -**Outputs** - -The output is in SAM format, and has the following columns:: - - Column Description - -------- -------------------------------------------------------- - 1 QNAME seq-ID - 2 FLAG this flag tries to take the strand a bisulfite read - originated from into account - (this is different from ordinary DNA alignment flags!) - 3 RNAME chromosome - 4 POS start position - 5 MAPQ always 255 - 6 CIGAR extended CIGAR string - 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) - 8 MPOS 1-based Mate POSition - 9 ISIZE Inferred insert SIZE - 10 SEQ query SEQuence on the same strand as the reference - 11 QUAL Phred33 scale - 12 NM-tag edit distance to the reference) - 13 XX-tag base-by-base mismatches to the reference. - This does not include indels. - 14 XM-tag methylation call string - 15 XR-tag read conversion state for the alignment - 16 XG-tag genome conversion state for the alignment - - -Each read of paired-end alignments is written out in a separate line in the above format. - - -It looks like this (scroll sideways to see the entire example):: - - QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT - HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh - HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh - -------- - -**Bismark settings** - -All of the options have a default value. You can change any of them. If any Bismark function is missing please contact the tool author or your Galaxy admin. - ------- - -**Bismark parameter list** - -This is an exhaustive list of Bismark options. - -Input:: - - --singles A comma- or space-separated list of files containing the reads to be aligned (e.g. - lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will - produce one mapping result and one report file per input file. - - -1 mates1 Comma-separated list of files containing the #1 mates (filename usually includes - "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates2. - Reads may be a mix of different lengths. Bismark will produce one mapping result - and one report file per paired-end input file pair. - - -2 mates2 Comma-separated list of files containing the #2 mates (filename usually includes - "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates1. - Reads may be a mix of different lengths. - - -q/--fastq The query input files (specified as mate1,mate2 or singles are FASTQ - files (usually having extension .fg or .fastq). This is the default. See also - --solexa-quals. - - -f/--fasta The query input files (specified as mate1,mate2 or singles are FASTA - files (usually havin extension .fa, .mfa, .fna or similar). All quality values - are assumed to be 40 on the Phred scale. - - -s/--skip INT Skip (i.e. do not align) the first INT reads or read pairs from the input. - - -u/--upto INT Only aligns the first INT reads or read pairs from the input. Default: no limit. - - --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. - - --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. - - --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled - (which can't). The formula for conversion is: - phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This - is usually the right option for use with (unconverted) reads emitted by the GA - Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. - - --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) - reads emitted by GA Pipeline version 1.3 or later. Default: off. - - -Alignment:: - - -n/--seedmms INT The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs - of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the - default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). - - -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to - which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for - larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). - - -e/--maqerr INT Maximum permitted total of quality values at all mismatched read positions throughout - the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds - quality values to the nearest 10 and saturates at 30. This value is not relevant for - Bowtie 2. - - --chunkmbs INT The number of megabytes of memory a given thread is given to store path descriptors in - --best mode. Best-first search must keep track of many paths at once to ensure it is - always extending the path with the lowest cumulative cost. Bowtie tries to minimize the - memory impact of the descriptors, but they can still grow very large in some cases. If - you receive an error message saying that chunk memory has been exhausted in --best mode, - try adjusting this parameter up to dedicate more memory to the descriptors. This value - is not relevant for Bowtie 2. Default: 512. - - -I/--minins INT The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and - a paired-end alignment consists of two 20-bp alignments in the appropriate orientation - with a 20-bp gap between them, that alignment is considered valid (as long as -X is also - satisfied). A 19-bp gap would not be valid in that case. Default: 0. - - -X/--maxins INT The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and - a paired-end alignment consists of two 20-bp alignments in the proper orientation with a - 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). - A 61-bp gap would not be valid in that case. Default: 500. - - - -Output:: - - --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four - bisulfite strands will be reported. Default: OFF. - - (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary - to the original strands are merely theoretical and should not exist in reality. Specifying directional - alignments (which is the default) will only run 2 alignment threads to the original top (OT) - or bottom (OB) strands in parallel and report these alignments. This is the recommended option - for sprand-specific libraries). - - --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are - split up into several smaller files to run concurrently and the output files are to be merged. - - --quiet Print nothing besides alignments. - - --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead - of SAM format output. - - -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will - appear as they did in the input, without any translation of quality values that may have - taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 - and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads - with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) - are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. - - --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest - mismatches or other reads that fail to align uniquely to a file in the output directory. - Written reads will appear as they did in the input, without any of the translation of quality - values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two - parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and - _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. - - -o/--output_dir DIR Write all output files into this directory. By default the output files will be written into - the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt - to create it first. The path to the output folder can be either relative or absolute. - - --temp_dir DIR Write temporary files to this directory instead of into the same directory as the input files. If - the specified folder does not exist, Bismark will attempt to create it first. The path to the - temporary folder can be either relative or absolute. - ------- - -Bowtie 2 alignment options:: - - -N INT Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. - Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) - but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for - Bowtie 1 see -n). - - -L INT Sets the length of the seed substrings to align during multiseed alignment. Smaller values - make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is - used by default, which sets -L to 20. This option is only available for Bowtie 2 (for - Bowtie 1 see -l). - - --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched - position to be the highest possible, regardless of the actual value. I.e. input is treated - as though all quality values are high. This is also the default behavior when the input - doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. - - -Bowtie 2 paired-end options:: - - --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if - it cannot find a concordant or discordant alignment for a pair. This option is invariable and - and on by default. - - --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. - A discordant alignment is an alignment where both mates align uniquely, but that does not - satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior - and it is on by default. - - -Bowtie 2 effort options:: - - -D INT Up to INT consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using - the alignments found so far. A seed extension "fails" if it does not yield a new best or a - new second-best alignment. Default: 15. - - -R INT INT is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. - When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of - mismatches allowed) at different offsets and searches for more alignments. A read is considered - to have repetitive seeds if the total number of seed hits divided by the number of seeds - that aligned at least once is greater than 300. Default: 2. - - -Bowtie 2 Scoring options:: - - --score_min "func" Sets a function governing the minimum alignment score needed for an alignment to be considered - "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying - L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. - See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is - L,0,-0.2. - - -Bowtie 2 Reporting options:: - - --most_valid_alignments INT This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is - deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the - default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the - effort expended to find valid alignments. - - For reference, this used to be the old (now deprecated) description of -M: - Bowtie 2 searches for at most INT+1 distinct, valid alignments for each read. The search terminates when it - can't find more distinct valid alignments, or when it finds INT+1 distinct alignments, whichever - happens first. Only the best alignment is reported. Information from the other alignments is used to - estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes - Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that - aligns many places. For reads that have more than INT+1 distinct, valid alignments, Bowtie 2 does not - guarantee that the alignment reported is the best possible in terms of alignment score. -M is - always used and its default value is set to 10. - -]]> - - - 10.1093/bioinformatics/btr167 - - diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_bowtie_wrapper.xml --- a/bismark_bowtie_wrapper.xml Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,563 +0,0 @@ - - - bisulfite mapper (bowtie) - - - SCRIPT_PATH - samtools - bowtie - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['isReportOutput'] is True - )) - - - - - (( - params['settingsType'] == "custom" and - params['bismark_stdout'] is True - )) - - - - - - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['suppressed_read_file'] is True - )) - - - - - - - - - - - - - - - - singlePaired['sPaired'] == "paired" - params['settingsType'] == "custom" - params['supressed_read_file'] is True - - - - - - - - - - - - - - - - - (( - params['settingsType'] == "custom" and - params['unmapped_read_file'] is True - )) - - - - - - - - - - - - - - - singlePaired['sPaired'] == "paired" - params['settingsType'] == "custom" - params['unmapped_read_file'] is True - - - - - - - - - - - - - - - - - - -T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). -Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome -(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the -forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) -are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original -sequence from the genome and determine if there were any protected C's present or not. - -.. _Bismark: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - -As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be -re-enabled by using non_directional mode. - -It is developed by Krueger F and Andrews SR. at the Babraham Institute. Krueger F, Andrews SR. (2011) Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics, 27, 1571-2. - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words = running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - - .. __: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - - -.. class:: warningmark - -Make sure all your input reads are in the correct and same format. If thats not the case please adjust/convert the filetype with galaxy's build-in converters. - ------- - -**Input formats** - -Bismark accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*), Illumina FASTQ format (galaxy type *fastqillumina*) or FASTA format (galaxy type *fasta*). Use the FASTQ Groomer to prepare your files. - ------- - -**A Note on Built-in Reference Genomes** - -The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY. - ------- - -The final output of Bismark is in SAM format by default. - -**Outputs** - -The output is in SAM format, and has the following columns:: - - Column Description - -------- -------------------------------------------------------- - 1 QNAME seq-ID - 2 FLAG this flag tries to take the strand a bisulfite read - originated from into account - (this is different from ordinary DNA alignment flags!) - 3 RNAME chromosome - 4 POS start position - 5 MAPQ always 255 - 6 CIGAR extended CIGAR string - 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) - 8 MPOS 1-based Mate POSition - 9 ISIZE Inferred insert SIZE - 10 SEQ query SEQuence on the same strand as the reference - 11 QUAL Phred33 scale - 12 NM-tag edit distance to the reference) - 13 XX-tag base-by-base mismatches to the reference. - This does not include indels. - 14 XM-tag methylation call string - 15 XR-tag read conversion state for the alignment - 16 XG-tag genome conversion state for the alignment - - -Each read of paired-end alignments is written out in a separate line in the above format. - - -It looks like this (scroll sideways to see the entire example):: - - QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT - HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh - HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh - -------- - -**Bismark settings** - -All of the options have a default value. You can change any of them. If any Bismark function is missing please contact the tool author or your Galaxy admin. - ------- - -**Bismark parameter list** - -This is an exhaustive list of Bismark options. - -Input:: - - --singles A comma- or space-separated list of files containing the reads to be aligned (e.g. - lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will - produce one mapping result and one report file per input file. - - -1 mates1 Comma-separated list of files containing the #1 mates (filename usually includes - "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates2. - Reads may be a mix of different lengths. Bismark will produce one mapping result - and one report file per paired-end input file pair. - - -2 mates2 Comma-separated list of files containing the #2 mates (filename usually includes - "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates1. - Reads may be a mix of different lengths. - - -q/--fastq The query input files (specified as mate1,mate2 or singles are FASTQ - files (usually having extension .fg or .fastq). This is the default. See also - --solexa-quals. - - -f/--fasta The query input files (specified as mate1,mate2 or singles are FASTA - files (usually havin extension .fa, .mfa, .fna or similar). All quality values - are assumed to be 40 on the Phred scale. - - -s/--skip INT Skip (i.e. do not align) the first INT reads or read pairs from the input. - - -u/--upto INT Only aligns the first INT reads or read pairs from the input. Default: no limit. - - --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. - - --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. - - --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled - (which can't). The formula for conversion is: - phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This - is usually the right option for use with (unconverted) reads emitted by the GA - Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. - - --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) - reads emitted by GA Pipeline version 1.3 or later. Default: off. - - -Alignment:: - - -n/--seedmms INT The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs - of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the - default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). - - -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to - which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for - larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). - - -e/--maqerr INT Maximum permitted total of quality values at all mismatched read positions throughout - the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds - quality values to the nearest 10 and saturates at 30. This value is not relevant for - Bowtie 2. - - --chunkmbs INT The number of megabytes of memory a given thread is given to store path descriptors in - --best mode. Best-first search must keep track of many paths at once to ensure it is - always extending the path with the lowest cumulative cost. Bowtie tries to minimize the - memory impact of the descriptors, but they can still grow very large in some cases. If - you receive an error message saying that chunk memory has been exhausted in --best mode, - try adjusting this parameter up to dedicate more memory to the descriptors. This value - is not relevant for Bowtie 2. Default: 512. - - -I/--minins INT The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and - a paired-end alignment consists of two 20-bp alignments in the appropriate orientation - with a 20-bp gap between them, that alignment is considered valid (as long as -X is also - satisfied). A 19-bp gap would not be valid in that case. Default: 0. - - -X/--maxins INT The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and - a paired-end alignment consists of two 20-bp alignments in the proper orientation with a - 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). - A 61-bp gap would not be valid in that case. Default: 500. - - - -Output:: - - --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four - bisulfite strands will be reported. Default: OFF. - - (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary - to the original strands are merely theoretical and should not exist in reality. Specifying directional - alignments (which is the default) will only run 2 alignment threads to the original top (OT) - or bottom (OB) strands in parallel and report these alignments. This is the recommended option - for sprand-specific libraries). - - --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are - split up into several smaller files to run concurrently and the output files are to be merged. - - --quiet Print nothing besides alignments. - - --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead - of SAM format output. - - -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will - appear as they did in the input, without any translation of quality values that may have - taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 - and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads - with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) - are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. - - --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest - mismatches or other reads that fail to align uniquely to a file in the output directory. - Written reads will appear as they did in the input, without any of the translation of quality - values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two - parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and - _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. - - -o/--output_dir DIR Write all output files into this directory. By default the output files will be written into - the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt - to create it first. The path to the output folder can be either relative or absolute. - - --temp_dir DIR Write temporary files to this directory instead of into the same directory as the input files. If - the specified folder does not exist, Bismark will attempt to create it first. The path to the - temporary folder can be either relative or absolute. - -]]> - - - 10.1093/bioinformatics/btr167 - - diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_deduplicate/bismark_deduplicate_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_deduplicate/bismark_deduplicate_wrapper.py Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,66 @@ +#!/usr/bin/python + +import argparse +import os +import re +import shutil +import subprocess +import sys +import tempfile +import logging +from glob import glob + +def cleanup_before_exit(tmp_dir): + if tmp_dir and os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +def get_arg(): + parser = argparse.ArgumentParser() + parser.add_argument('--tool_dir', dest='tool_dir', action='store', nargs=1, metavar='tool_dir', type=str) + parser.add_argument('-p', action='store_true') + parser.add_argument('-s', action='store_true') + parser.add_argument('--input', dest='input', action='store', nargs=1, metavar='input', type=str) + parser.add_argument('--output_report', dest='output_report', action='store', nargs=1, metavar='output_report', type=str) + parser.add_argument('--output_bam', dest='output_bam', action='store', nargs=1, metavar='output_report', type=str) + parser.add_argument('--log_report', dest='log_report', action='store', nargs=1, metavar='log_filename', type=str) + args = parser.parse_args() + return args + +def __main__(): + args = get_arg() + + tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') + os.chdir(tmp_dir) + + if args.log_report: + logging.basicConfig(level=logging.INFO, filename=args.log_report[0], filemode="a+", format='%(message)s') + else: + logging.basicConfig(level=logging.INFO, filename=os.path.join(tmp_dir, 'log_report.txt'), filemode="a+", format='%(message)s') + + default_reads_name = 'submitted_reads.bam' + os.symlink(args.input[0], default_reads_name) + + if args.p is True: + sPaired = '-p' + if args.s is True: + sPaired = '-s' + + cmd = 'perl %s %s duplicated_reads.bam --bam' % (os.path.join(args.tool_dir[0], 'deduplicate_bismark'), sPaired) + logging.info('COMMAND LINE:\n\n%s' % cmd) + + proc = subprocess.Popen(['perl', os.path.join(args.tool_dir[0], 'deduplicate_bismark'), sPaired, default_reads_name, '--bam'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc_out, proc_err = proc.communicate() + + logging.info("__________________________________________________________________\n") + logging.info("BISMARK DEDUPLICATE STDOUT:\n\n%s" % proc_out) + if proc_err: + logging.critical("__________________________________________________________________\n") + logging.critical("BISMARK DEDUPLICATE WARNING:\n\n%s" % proc_err) + sys.exit("Dedpulicate Bismark crashed with the folowing error message:\n%s" % proc_err) + + shutil.move( glob('*deduplicated.bam')[0], args.output_bam[0] ) + shutil.move( glob('*deduplication_report.txt')[0], args.output_report[0]) + + cleanup_before_exit(tmp_dir) + +if __name__=="__main__": __main__() \ No newline at end of file diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_deduplicate/bismark_deduplicate_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_deduplicate/bismark_deduplicate_wrapper.xml Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,69 @@ + + + Deduplicates reads mapped by Bismark + + + + samtools + bowtie2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1093/bioinformatics/btr167 + + \ No newline at end of file diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_deduplicate/deduplicate_bismark --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_deduplicate/deduplicate_bismark Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,1272 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Getopt::Long; + + +### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification +### Paired-end alignments are considered a duplicate if both partner reasd start and end at the exact same position + +### May 13, 2013 +### Changed the single-end trimming behavior so that only the start coordinate will be used. This avoids duplicate reads that have been trimmed to a varying extent +### Changed the way of determining the end of reads in SAM format to using the CIGAR string if the read contains InDels + +### 16 July 2013 +### Adding a new deduplication mode for barcoded RRBS-Seq + +### 27 Sept 2013 +### Added close statement for all output filehandles (which should probably have been there from the start...) + +### 8 Jan 2015 +### to detect paired-end command from the @PG line we are no requiring spaces before and after the -1 or -2 + +### 09 Mar 2015 +### Removing newline characters also from the read conversion flag in case the tags had been reordered and are now present in the very last column + +### 19 08 2015 +### Hiding the option --representative from view to discourage people from using it (it was nearly always not what they wanted to do anyway). It should still work +### for alignments that do not contain any indels +### Just for reference, here is the the text: +### print "--representative\twill browse through all sequences and print out the sequence with the most representative (as in most frequent) methylation call for any given position. Note that this is very likely the most highly amplified PCR product for a given sequence\n\n"; + + +my $dedup_version = 'v0.16.3'; + +my $help; +my $representative; +my $single; +my $paired; +my $global_single; +my $global_paired; +my $vanilla; +my $samtools_path; +my $bam; +my $rrbs; +my $version; + +my $command_line = GetOptions ('help' => \$help, + 'representative' => \$representative, + 's|single' => \$global_single, + 'p|paired' => \$global_paired, + 'vanilla' => \$vanilla, + 'samtools_path=s' => \$samtools_path, + 'bam' => \$bam, + 'barcode' => \$rrbs, + 'version' => \$version, + ); + +die "Please respecify command line options\n\n" unless ($command_line); + +if ($help){ + print_helpfile(); + exit; +} + +if ($version){ + print << "VERSION"; + + Bismark Deduplication Module + + Deduplicator Version: $dedup_version + Copyright 2010-16 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + +my @filenames = @ARGV; + +unless (@filenames){ + print "Please provide one or more Bismark output files for deduplication\n\n"; + sleep (2); + print_helpfile(); + exit; +} + + +### OPTIONS +unless ($global_single or $global_paired){ + if ($vanilla){ + die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n"; + } + warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n"; +} + +if ($global_paired){ + if ($global_single){ + die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n"; + } + if ($vanilla){ + + if ($rrbs){ + die "Barcode deduplication only works with Bismark SAM (or BAM) output (in attempt to phase out the vanilla format)\n"; + } + + warn "Processing paired-end custom Bismark output file(s):\n"; + warn join ("\t",@filenames),"\n\n"; + } + else{ + warn "Processing paired-end Bismark output file(s) (SAM format):\n"; + warn join ("\t",@filenames),"\n\n"; + } +} +else{ + if ($vanilla){ + warn "Processing single-end custom Bismark output file(s):\n"; + warn join ("\t",@filenames),"\n\n"; + } + else{ + warn "Processing single-end Bismark output file(s) (SAM format):\n"; + warn join ("\t",@filenames),"\n\n"; + } +} + +### PATH TO SAMTOOLS +if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } +} +# Check whether Samtools is in the PATH if no path was supplied by the user +else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } +} + +if ($bam){ + if (defined $samtools_path){ + $bam = 1; + } + else{ + warn "No Samtools found on your system, writing out a gzipped SAM file instead\n"; + $bam = 2; + } +} +else{ + $bam = 0; +} + + +if ($representative){ + warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n"; + sleep (1); +} +elsif($rrbs){ + warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique barcode will be chosen)\n\n"; + sleep (1); +} +else{ # default; random (=first) alignment + warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n"; + sleep (1); +} + +foreach my $file (@filenames){ + + if ($global_single){ + $paired = 0; + $single = 1; + } + elsif($global_paired){ + $paired = 1; + $single = 0; + } + + unless($global_single or $global_paired){ + + warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1); + + ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file + if ($file =~ /\.gz$/){ + open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/){ + open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; + } + while (){ + last unless (/^\@/); + if ($_ =~ /^\@PG/){ + # warn "found the \@PG line:\n"; + # warn "$_"; + + if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ + warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1); + $paired = 1; + $single = 0; + } + else{ + warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1); + $paired = 0; + $single = 1; + } + } + } + close DETERMINE or warn "$!\n"; + } + + if ($file =~ /(\.bam$|\.sam$)/){ + bam_isEmpty($file); + } + + + ### OPTIONS + unless ($single or $paired){ + die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n"; + } + + ### + unless ($vanilla){ + if ($paired){ + test_positional_sorting($file); + } + } + + + ### writing to a report file + my $report = $file; + + $report =~ s/\.gz$//; + $report =~ s/\.sam$//; + $report =~ s/\.bam$//; + $report =~ s/\.txt$//; + $report =~ s/$/.deduplication_report.txt/; + + open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n"; + + + ### for representative methylation calls we need to discriminate between single-end and paired-end files as the latter have 2 methylation call strings + if($representative){ + deduplicate_representative($file); + } + + elsif($rrbs){ + deduplicate_barcoded_rrbs($file); + } + + ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option + else{ + + my %unique_seqs; + my %positions; + + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/){ + open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (IN,$file) or die "Unable to read from $file: $!\n"; + } + + my $outfile = $file; + $outfile =~ s/\.gz$//; + $outfile =~ s/\.sam$//; + $outfile =~ s/\.bam$//; + $outfile =~ s/\.txt$//; + + if ($vanilla){ + $outfile =~ s/$/_deduplicated.txt/; + } + else{ + if ($bam == 1){ + $outfile =~ s/$/.deduplicated.bam/; + } + elsif ($bam == 2){ + $outfile =~ s/$/.deduplicated.sam.gz/; + } + else{ + $outfile =~ s/$/.deduplicated.sam/; + } + } + if ($bam == 1){ + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + + else{ + open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; + } + + ### need to proceed slightly differently for the custom Bismark and Bismark SAM output + if ($vanilla){ + $_ = ; # Bismark version header + print OUT; # Printing the Bismark version to the de-duplicated file again + } + my $count = 0; + my $unique_seqs = 0; + my $removed = 0; + + while (){ + + if ($count == 0){ + if ($_ =~ /^Bismark version:/){ + warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; + sleep (2); + print_helpfile(); + exit; + } + } + + ### if this was a SAM file we ignore header lines + unless ($vanilla){ + if (/^\@\w{2}\t/){ + print "skipping header line:\t$_"; + print OUT "$_"; # Printing the header lines again into the de-duplicated file + next; + } + } + + ++$count; + my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths + + my ($strand,$chr,$start,$end,$cigar); + my $line1; + + if ($vanilla){ + ($strand,$chr,$start,$end) = (split (/\t/))[1,2,3,4]; + } + else{ # SAM format + ($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand + + ### SAM single-end + if ($single){ + + if ($strand == 0 ){ + ### read aligned to the forward strand. No action needed + } + elsif ($strand == 16){ + ### read is on reverse strand + + $start -= 1; # only need to adjust this once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1; + } + + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar\n"; + ### determining end position of a read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $start += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $start += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + } + $composite = join (":",$strand,$chr,$start); + } + elsif($paired){ + + ### storing the current line + $line1 = $_; + + my $read_conversion; + my $genome_conversion; + + while ( /(XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + chomp $read_conversion; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + chomp $genome_conversion; + } + } + die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion); + + + my $index; + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2 + if ($index == 0 or $index == 2){ + + ### reading in the next line + $_ = ; + # the only thing we need is the end position + ($end,my $cigar_2) = (split (/\t/))[3,5]; + + $end -= 1; # only need to adjust this once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar_2 =~ /^(\d+)M$/){ # linear match + $end += $1; + } + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar_2); # storing the length per operation + my @ops = split (/\d+/,$cigar_2); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar_2\n"; + ### determining end position of the read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $end += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $end += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + } + else{ + # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line + + $end = $start - 1; # need to adjust this only once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar =~ /^(\d+)M$/){ # linear match + $end += $1; + } + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar\n"; + ### determining end position of the read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $end += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $end += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + + ### reading in the next line + $_ = ; + # the only thing we need is the start position + ($start) = (split (/\t/))[3]; + } + $composite = join (":",$strand,$chr,$start,$end); + } + + else{ + die "Input must be single or paired-end\n"; + } + } + + if (exists $unique_seqs{$composite}){ + ++$removed; + unless (exists $positions{$composite}){ + $positions{$composite}++; + } + } + else{ + if ($paired){ + unless ($vanilla){ + print OUT "$line1"; # printing first paired-end line for SAM output + } + } + print OUT "$_"; # printing single-end SAM alignment or second paired-end line + $unique_seqs{$composite}++; + } + } + + my $percentage; + my $percentage_leftover; + my $leftover = $count - $removed; + + unless ($count == 0){ + $percentage = sprintf("%.2f",$removed/$count*100); + $percentage_leftover = sprintf("%.2f",$leftover/$count*100); + } + else{ + $percentage = 'N/A'; + $percentage_leftover = 'N/A'; + } + + warn "\nTotal number of alignments analysed in $file:\t$count\n"; + warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; + warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; + warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; + + print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; + print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; + print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; + print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; + } + + close OUT or warn "Failed to close output filehandle: $!\n"; + close REPORT or warn "Failed to close report filehandle: $!\n"; + +} + + +sub deduplicate_barcoded_rrbs{ + + my $file = shift; + + my %unique_seqs; + my %positions; + + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/){ + open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (IN,$file) or die "Unable to read from $file: $!\n"; + } + + my $outfile = $file; + $outfile =~ s/\.gz$//; + $outfile =~ s/\.sam$//; + $outfile =~ s/\.bam$//; + $outfile =~ s/\.txt$//; + + if ($vanilla){ + $outfile =~ s/$/_dedup_RRBS.txt/; + } + else{ + if ($bam == 1){ + $outfile =~ s/$/.dedup_RRBS.bam/; + } + elsif ($bam == 2){ + $outfile =~ s/$/.dedupRRBS.sam.gz/; + } + else{ + $outfile =~ s/$/.dedup_RRBS.sam/; + } + } + if ($bam == 1){ + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ + open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; + } + + ### This mode only supports Bismark SAM output + my $count = 0; + my $unique_seqs = 0; + my $removed = 0; + + while (){ + + if ($count == 0){ + if ($_ =~ /^Bismark version:/){ + warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; + sleep (2); + print_helpfile(); + exit; + } + } + + ### if this was a SAM file we ignore header lines + if (/^\@\w{2}\t/){ + warn "skipping SAM header line:\t$_"; + print OUT; # Printing the header lines again into the de-duplicated file + next; + } + + ++$count; + my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths + ### in this barcoded mode we also store the read barcode as additional means of assisting the deduplication + ### in effect the $composite string looks like this (separated by ':'): + + ### FLAG:chromosome:start:barcode + + my $end; + my $line1; + + # SAM format + my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand + + $id =~ /:(\w+)$/; + my $barcode = $1; + unless ($barcode){ + die "Failed to extract a barcode from the read ID (last element of each read ID needs to be the barcode sequence, e.g. ':CATG'\n\n"; + } + + ### SAM single-end + if ($single){ + + if ($strand == 0 ){ + ### read aligned to the forward strand. No action needed + } + elsif ($strand == 16){ + ### read is on reverse strand + + $start -= 1; # only need to adjust this once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1; + } + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar\n"; + ### determining end position of a read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $start += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $start += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + } + + ### Here we take the barcode sequence into consideration + $composite = join (":",$strand,$chr,$start,$barcode); + # warn "$composite\n\n"; + # sleep(1); + } + elsif($paired){ + + ### storing the current line + $line1 = $_; + + my $read_conversion; + my $genome_conversion; + + while ( /(XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + chomp $read_conversion; + } + elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + chomp $genome_conversion; + } + } + die "Failed to determine read and genome conversion from line: $line1\n\n" unless ($read_conversion and $read_conversion); + + + my $index; + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2 + if ($index == 0 or $index == 2){ + + ### reading in the next line + $_ = ; + # the only thing we need is the end position + ($end,my $cigar_2) = (split (/\t/))[3,5]; + + $end -= 1; # only need to adjust this once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar_2 =~ /^(\d+)M$/){ # linear match + $end += $1; + } + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar_2); # storing the length per operation + my @ops = split (/\d+/,$cigar_2); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar_2\n"; + ### determining end position of the read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $end += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $end += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + } + else{ + # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line + + $end = $start - 1; # need to adjust this only once + + # for InDel free matches we can simply use the M number in the CIGAR string + if ($cigar =~ /^(\d+)M$/){ # linear match + $end += $1; + } + else{ + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops); + + # warn "CIGAR string; $cigar\n"; + ### determining end position of the read + foreach my $index(0..$#len){ + if ($ops[$index] eq 'M'){ # standard matching bases + $end += $len[$index]; + # warn "Operation is 'M', adding $len[$index] bp\n"; + } + elsif($ops[$index] eq 'I'){ # insertions do not affect the end position + # warn "Operation is 'I', next\n"; + } + elsif($ops[$index] eq 'D'){ # deletions do affect the end position + # warn "Operation is 'D',adding $len[$index] bp\n"; + $end += $len[$index]; + } + else{ + die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n"; + } + } + } + + ### reading in the next line + $_ = ; + # the only thing we need is the start position + ($start) = (split (/\t/))[3]; + } + + ### Here we take the barcode sequence into consideration + $composite = join (":",$strand,$chr,$start,$end,$barcode); + } + else{ + die "Input must be single or paired-end\n"; + } + + if (exists $unique_seqs{$composite}){ + ++$removed; + unless (exists $positions{$composite}){ + $positions{$composite}++; + } + } + else{ + if ($paired){ + print OUT $line1; # printing first paired-end line for SAM output + } + print OUT; # printing single-end SAM alignment or second paired-end line + $unique_seqs{$composite}++; + } + } + + my $percentage; + my $percentage_leftover; + my $leftover = $count - $removed; + + unless ($count == 0){ + $percentage = sprintf("%.2f",$removed/$count*100); + $percentage_leftover = sprintf("%.2f",$leftover/$count*100); + } + else{ + $percentage = 'N/A'; + $percentage_leftover = 'N/A'; + } + + + warn "\nTotal number of alignments analysed in $file:\t$count\n"; + warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; + warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; + warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; + + print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; + print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n"; + print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n"; + print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n"; + +} + +sub bam_isEmpty{ + + my $file = shift; + + if ($file =~ /\.bam$/){ + open (EMPTY,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (EMPTY,$file) or die "Unable to read from $file: $!\n"; + } + my $count = 0; + while (){ + if ($_){ + $count++; # file contains data, fine. + } + last; # one line is enough + } + + if ($count == 0){ + die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n"; + } + close EMPTY or warn "$!\n"; +} + + +sub print_helpfile{ + print "\n",'='x111,"\n"; + print "\nThis script is supposed to remove alignments to the same position in the genome from the Bismark mapping output\n(both single and paired-end SAM files), which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nNote that deduplication is not recommended for RRBS-type experiments!\n\nIn the default mode, the first alignment to a given position will be used irrespective of its methylation call\n(this is the fastest option, and as the alignments are not ordered in any way this is also near enough random).\n\n"; + print "For single-end alignments only use the start coordinate of a read will be used for deduplication.\n\n"; + print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication. "; + print "This script expects the Bismark output to be in SAM format\n(Bismark v0.6.x or higher). To deduplicate the old custom Bismark output please specify '--vanilla'.\n\n"; + print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n) ***\n\n"; + + print '='x111,"\n\n"; + print ">>> USAGE: ./deduplicate_bismark_alignment_output.pl [options] filename(s) <<<\n\n"; + + print "-s/--single\t\tdeduplicate single-end Bismark files (default format: SAM)\n"; + print "-p/--paired\t\tdeduplicate paired-end Bismark files (default format: SAM)\n\n"; + print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n"; + print "--barcode\t\tIn addition to chromosome, start position and orientation this will also take a potential barcode into\n consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n"; + print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n"; + print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n"; + print "--version\t\tPrint version information and exit\n"; + + print '='x111,"\n\n"; + + print "This script was last modified on November 04, 2015\n\n"; +} + + + + +sub test_positional_sorting{ + + my $filename = shift; + + print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; + sleep(1); + + if ($filename =~ /\.gz$/) { + open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (TEST,$filename) or die "Can't open file $filename: $!\n"; + } + + my $count = 0; + + while () { + if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) + if (/^\@SO/) { + die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n"; + } + next; + } + $count++; + + last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID + + my ($id_1) = (split (/\t/)); + + ### reading the next line which should be read 2 + $_ = ; + my ($id_2) = (split (/\t/)); + last unless ($id_2); + ++$count; + + if ($id_1 eq $id_2){ + ### ids are the same + next; + } + else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first + my $id_1_trunc = $id_1; + $id_1_trunc =~ s/\/1$//; + my $id_2_trunc = $id_2; + $id_2_trunc =~ s/\/2$//; + + unless ($id_1_trunc eq $id_2_trunc){ + die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n"; + } + } + } + # close TEST or die $!; somehow fails on our cluster... + ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) + warn "...passed!\n"; + sleep(1); + +} + +sub isBam{ + + my $filename = shift; + + # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) + open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; + + ### when BAM files read through a gunzip -c stream they start with BAM... + my $bam_in_disguise = ; + # warn "BAM in disguise: $bam_in_disguise\n\n"; + + if ($bam_in_disguise){ + if ($bam_in_disguise =~ /^BAM/){ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 1; + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } +} + + + + + +##################################### + +### The following subroutine "deduplicate_representative" only works correctly for reads that do not contain any indels (as in only Bowtie1-based alignments). +### Please refrain from using it unless you want to test something out. + +##################################### + +sub deduplicate_representative { + my $file = shift; + + my %positions; + my %unique_seqs; + + my $count = 0; + my $unique_seqs = 0; + my $removed = 0; + + ### going through the file first and storing all positions as well as their methylation call strings in a hash + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/){ + open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (IN,$file) or die "Unable to read from $file: $!\n"; + } + + if ($single){ + + my $outfile = $file; + $outfile =~ s/\.gz$//; + $outfile =~ s/\.sam$//; + $outfile =~ s/\.bam$//; + $outfile =~ s/\.txt$//; + + if ($vanilla){ + $outfile =~ s/$/.deduplicated_to_representative_sequences.txt/; + } + else{ + if ($bam == 1){ + $outfile =~ s/$/.deduplicated_to_representative_sequences.bam/; + } + elsif ($bam == 2){ + $outfile =~ s/$/.deduplicated_to_representative_sequences.sam.gz/; + } + else{ + $outfile =~ s/$/.deduplicated_to_representative_sequences.sam/; + } + } + + if ($bam == 1){ + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ + open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; + } + + warn "Reading and storing all alignment positions\n"; + + ### need to proceed slightly differently for the custom Bismark and Bismark SAM output + if ($vanilla){ + $_ = ; # Bismark version header + print OUT; # Printing the Bismark version to the de-duplicated file again + } + + while (){ + + if ($count == 0){ + if ($_ =~ /^Bismark version:/){ + warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; + sleep (2); + print_helpfile(); + exit; + } + } + + ### if this was a SAM file we ignore header lines + unless ($vanilla){ + if (/^\@\w{2}\t/){ + warn "skipping SAM header line:\t$_"; + print OUT; # Printing the header lines again into the de-duplicated file + next; + } + } + + my ($strand,$chr,$start,$end,$meth_call); + + if ($vanilla){ + ($strand,$chr,$start,$end,$meth_call) = (split (/\t/))[1,2,3,4,7]; + } + else{ # SAM format + + ($strand,$chr,$start,my $seq,$meth_call) = (split (/\t/))[1,2,3,9,13]; # we are assigning the FLAG value to $strand + ### SAM single-end + $end = $start + length($seq) - 1; + } + + my $composite = join (":",$strand,$chr,$start,$end); + + $count++; + $positions{$composite}->{$meth_call}->{count}++; + $positions{$composite}->{$meth_call}->{alignment} = $_; + } + warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n"; + close IN or warn $!; + } + + elsif ($paired){ + + ### we are going to concatenate both methylation call strings from the paired end file to form a joint methylation call string + + my $outfile = $file; + if ($vanilla){ + $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.txt/; + } + else{ + $outfile =~ s/$/_deduplicated_to_representative_sequences_pe.sam/; + } + + open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n"; + warn "Reading and storing all alignment positions\n"; + + ### need to proceed slightly differently for the custom Bismark and Bismark SAM output + if ($vanilla){ + $_ = ; # Bismark version header + print OUT; # Printing the Bismark version to the de-duplicated file again + } + + while (){ + + if ($count == 0){ + if ($_ =~ /^Bismark version:/){ + warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n"; + sleep (2); + print_helpfile(); + exit; + } + } + + ### if this was a SAM file we ignore header lines + unless ($vanilla){ + if (/^\@\w{2}\t/){ + warn "skipping SAM header line:\t$_"; + print OUT; # Printing the header lines again into the de-duplicated file + next; + } + } + + my ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2); + my $line1; + + if ($vanilla){ + ($strand,$chr,$start,$end,$meth_call_1,$meth_call_2) = (split (/\t/))[1,2,3,4,7,10]; + } + else{ # SAM paired-end format + + ($strand,$chr,$start,$meth_call_1) = (split (/\t/))[1,2,3,13]; # we are assigning the FLAG value to $strand + + ### storing the first line (= read 1 alignment) + $line1 = $_; + + ### reading in the next line + $_ = ; + # we only need the end position and the methylation call + (my $pos,my $seq_2,$meth_call_2) = (split (/\t/))[3,9,13]; + $end = $pos + length($seq_2) - 1; + } + + my $composite = join (":",$strand,$chr,$start,$end); + $count++; + my $meth_call = $meth_call_1.$meth_call_2; + + $positions{$composite}->{$meth_call}->{count}++; + if ($vanilla){ + $positions{$composite}->{$meth_call}->{alignment} = $_; + } + else{ # SAM PAIRED-END + $positions{$composite}->{$meth_call}->{alignment_1} = $line1; + $positions{$composite}->{$meth_call}->{alignment_2} = $_; + } + } + warn "Stored ",scalar keys %positions," different positions for $count sequences in total (+ and - alignments to the same position are scored individually)\n\n"; + close IN or warn $!; + } + + ### PRINTING RESULTS + + ### Now going through all stored positions and printing out the methylation call which is most representative, i.e. the one which occurred most often + warn "Now printing out alignments with the most representative methylation call(s)\n"; + + foreach my $pos (keys %positions){ + foreach my $meth_call (sort { $positions{$pos}->{$b}->{count} <=> $positions{$pos}->{$a}->{count} }keys %{$positions{$pos}}){ + if ($paired){ + if ($vanilla){ + print OUT $positions{$pos}->{$meth_call}->{alignment}; + } + else{ + print OUT $positions{$pos}->{$meth_call}->{alignment_1}; # SAM read 1 + print OUT $positions{$pos}->{$meth_call}->{alignment_2}; # SAM read 2 + } + } + else{ # single-end + print OUT $positions{$pos}->{$meth_call}->{alignment}; + } + $unique_seqs++; + last; ### exiting once we printed a sequence with the most frequent methylation call for a position + } + } + + my $percentage; + unless ($count == 0){ + $percentage = sprintf ("%.2f",$unique_seqs*100/$count); + } + else{ + $percentage = 'N/A'; + } + + warn "\nTotal number of alignments analysed in $file:\t$count\n"; + warn "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n"; + print REPORT "\nTotal number of alignments analysed in $file:\t$count\n"; + print REPORT "Total number of representative alignments printed from $file in total:\t$unique_seqs ($percentage%)\n\n"; + +} + + + diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_genome_preparation --- a/bismark_genome_preparation Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,459 +0,0 @@ -#!/usr/bin/perl -- -use strict; -use warnings; -use Cwd; -use File::Path qw(rmtree); -$|++; - - -## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . - -use Getopt::Long; -use Cwd; - -my $verbose; -my $help; -my $version; -my $man; -my $path_to_bowtie; -my $multi_fasta; -my $single_fasta; -my $bowtie2; - -my $bismark_version = 'v0.10.0'; - -GetOptions ('verbose' => \$verbose, - 'help' => \$help, - 'man' => \$man, - 'version' => \$version, - 'path_to_bowtie:s' => \$path_to_bowtie, - 'single_fasta' => \$single_fasta, - 'bowtie2' => \$bowtie2, - ); - -if ($help or $man){ - print_helpfile(); - exit; -} - -if ($version){ - print << "VERSION"; - - Bismark - Bisulfite Mapper and Methylation Caller. - - Bismark Genome Preparation Version: $bismark_version - Copyright 2010-13 Felix Krueger, Babraham Bioinformatics - www.bioinformatics.babraham.ac.uk/projects/ - -VERSION - exit; -} - -my $genome_folder = shift @ARGV; # mandatory - -# Ensuring a genome folder has been specified -if ($genome_folder){ - unless ($genome_folder =~ /\/$/){ - $genome_folder =~ s/$/\//; - } - $verbose and print "Path to genome folder specified as: $genome_folder\n"; - chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; - - # making the genome folder path abolsolute so it won't break if the path was specified relative - $genome_folder = getcwd; - unless ($genome_folder =~ /\/$/){ - $genome_folder =~ s/$/\//; - } -} -else{ - die "Please specify a genome folder to be used for bisulfite conversion\n\n"; -} - - -my $CT_dir; -my $GA_dir; - - -if ($single_fasta){ - print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; - $multi_fasta = 0; -} -else{ - print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; - $single_fasta = 0; - $multi_fasta = 1; -} - -my @filenames = create_bisulfite_genome_folders(); - -process_sequence_files (); - -launch_bowtie_indexer(); - -sub launch_bowtie_indexer{ - if ($bowtie2){ - print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; - } - else{ - print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; - } - print "Please be aware that this process can - depending on genome size - take up to several hours!\n"; - sleep(5); - - ### if the path to bowtie was specfified explicitely - if ($path_to_bowtie){ - if ($bowtie2){ - $path_to_bowtie =~ s/$/bowtie2-build/; - } - else{ - $path_to_bowtie =~ s/$/bowtie-build/; - } - } - ### otherwise we assume that bowtie-build is in the path - else{ - if ($bowtie2){ - $path_to_bowtie = 'bowtie2-build'; - } - else{ - $path_to_bowtie = 'bowtie-build'; - } - } - - $verbose and print "\n"; - - ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) - my $pid = fork(); - - # parent process - if ($pid){ - sleep(1); - chdir $CT_dir or die "Unable to change directory: $!\n"; - $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; - my @fasta_files = <*.fa>; - my $file_list = join (',',@fasta_files); - $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; - $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; - - sleep (11); - exec ("$path_to_bowtie","-f","$file_list","BS_CT"); - } - - # child process - elsif ($pid == 0){ - sleep(2); - chdir $GA_dir or die "Unable to change directory: $!\n"; - $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; - my @fasta_files = <*.fa>; - my $file_list = join (',',@fasta_files); - $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; - $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; - $verbose and print "(starting in 10 seconds)\n"; - sleep(10); - exec ("$path_to_bowtie","-f","$file_list","BS_GA"); - } - - # if the platform doesn't support the fork command we will run the indexing processes one after the other - else{ - print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; - sleep(10); - - ### moving to CT genome folder - $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; - chdir $CT_dir or die "Unable to change directory: $!\n"; - my @fasta_files = <*.fa>; - my $file_list = join (',',@fasta_files); - $verbose and print "$file_list\n\n"; - sleep(2); - system ("$path_to_bowtie","-f","$file_list","BS_CT"); - @fasta_files=(); - $file_list= ''; - - ### moving to GA genome folder - $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; - chdir $GA_dir or die "Unable to change directory: $!\n"; - @fasta_files = <*.fa>; - $file_list = join (',',@fasta_files); - $verbose and print "$file_list\n\n"; - sleep(2); - exec ("$path_to_bowtie","-f","$file_list","BS_GA"); - } -} - - -sub process_sequence_files { - - my ($total_CT_conversions,$total_GA_conversions) = (0,0); - $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; - sleep (3); - - $verbose and print "conversions performed:\n"; - $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; - - - ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file - ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle - ### This is now the default option - - if ($multi_fasta){ - ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files - my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; - open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; - - my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; - open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; - } - - foreach my $filename(@filenames){ - my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); - open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; - # warn "Reading chromosome information from $filename\n\n"; - - ### first line needs to be a fastA header - my $first_line = ; - chomp $first_line; - - ### Extracting chromosome name from the FastA header - my $chromosome_name = extract_chromosome_name($first_line); - - ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. - unless ($multi_fasta){ - my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; - $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; - open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; - - my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; - $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; - open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; - } - - print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry - print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry - - - while (){ - - ### in case the line is a new fastA header - if ($_ =~ /^>/){ - ### printing out the stats for the previous chromosome - $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; - ### resetting the chromosome transliteration counters - ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); - - ### Extracting chromosome name from the additional FastA header - $chromosome_name = extract_chromosome_name($_); - - ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. - unless ($multi_fasta){ - my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; - $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; - open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; - - my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; - $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; - open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; - } - - print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; - print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; - } - - else{ - my $sequence = uc$_; - - ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) - - $sequence =~ s/[^ATCGN\n\r]/N/g; - - ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) - - my $CT_sequence = $sequence; - my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts - $total_CT_conversions += $CT_transliterations_performed; - $chromosome_CT_conversions += $CT_transliterations_performed; - - print CT_CONVERT $CT_sequence; - - ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- - ### complementing the forward strand and then C->T converting it) - - my $GA_sequence = $sequence; - my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand - $total_GA_conversions += $GA_transliterations_performed; - $chromosome_GA_conversions += $GA_transliterations_performed; - - print GA_CONVERT $GA_sequence; - - } - } - $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; - } - close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; - close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; - - - print "\nTotal number of conversions performed:\n"; - print "C->T:\t$total_CT_conversions\n"; - print "G->A:\t$total_GA_conversions\n"; - - warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; -} - -sub extract_chromosome_name { - - my $header = shift; - - ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well - - if ($header =~ s/^>//){ - my ($chromosome_name) = split (/\s+/,$header); - return $chromosome_name; - } - else{ - die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n"; - } -} - -sub create_bisulfite_genome_folders{ - - $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; - - if ($path_to_bowtie){ - unless ($path_to_bowtie =~ /\/$/){ - $path_to_bowtie =~ s/$/\//; - } - if (chdir $path_to_bowtie){ - if ($bowtie2){ - $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; - } - else{ - $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; - } - } - else{ - die "There was an error with the path to bowtie: $!\n"; - } - } - - chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; - - - # Exiting unless there are fastA files in the folder - my @filenames = <*.fa>; - - ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta - unless (@filenames){ - @filenames = <*.fasta>; - } - - unless (@filenames){ - die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; - } - - warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n"; - sleep (3); - - # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists - my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; - unless (-d $bisulfite_dir){ - mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; - $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; - } - else{ - print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; - sleep(5); - } - - chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; - $CT_dir = "${bisulfite_dir}CT_conversion/"; - $GA_dir = "${bisulfite_dir}GA_conversion/"; - - # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) - # converted version of the genome - unless (-d $CT_dir){ - mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; - $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; - } - unless (-d $GA_dir){ - mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; - $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; - } - - # moving back to the original genome folder - chdir $genome_folder or die "Could't move to directory $genome_folder $!"; - # $verbose and print "Moved back to genome folder folder $genome_folder\n"; - warn "\nStep I - Prepare genome folders - completed\n\n\n"; - return @filenames; -} - -sub print_helpfile{ - print << 'HOW_TO'; - - -DESCRIPTION - -This script is supposed to convert a specified reference genome into two different bisulfite -converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first -bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs -converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference -genome folder. Once the bisulfite conversion has been completed the program will fork and launch -two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware -that the indexing process can take up to several hours; this will mainly depend on genome size -and system resources. - - - -The following is a brief description of command line options and arguments to control the -Bismark Genome Preparation: - - -USAGE: bismark_genome_preparation [options] - - -OPTIONS: - ---help/--man Displays this help filea and exits. - ---version Displays version information and exits. - ---verbose Print verbose output for more details or debugging. - ---path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system - (depending on which aligner/indexer you intend to use). Unless this path - is specified it is assumed that Bowtie is in the PATH. - ---bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1). - ---single_fasta Instruct the Bismark Indexer to write the converted genomes into - single-entry FastA files instead of making one multi-FastA file (MFA) - per chromosome. This might be useful if individual bisulfite converted - chromosomes are needed (e.g. for debugging), however it can cause a - problem with indexing if the number of chromosomes is vast (this is likely - to be in the range of several thousand files; the operating system can - only handle lists up to a certain length, and some newly assembled - genomes may contain 20000-50000 contigs of scaffold files which do exceed - this list length limit). - - -ARGUMENTS: - - The path to the folder containing the genome to be bisulfite converted. - The Bismark Genome Preparation expects one or more fastA files in the folder - (with the file extension: .fa or .fasta). Specifying this path is mandatory. - - -This script was last modified on 19 Sept 2013. -HOW_TO -} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_mapping/bismark --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_mapping/bismark Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,10220 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use IO::Handle; +use Cwd; +$|++; +use Getopt::Long; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + + +my $parent_dir = getcwd; +my $bismark_version = 'v0.16.3'; +my $command_line = join (" ",@ARGV); + + +### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail +foreach my $arg (@ARGV){ + if ($arg eq '--solexa1.3-quals'){ + $arg = '--phred64-quals'; + } +} +my @filenames; # will be populated by processing the command line + +my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index,$multicore,$rg_tag,$rg_id,$rg_sample,$ambig_bam,$cram,$cram_ref,$nucleotide_coverage,$dovetail) = process_command_line(); + +my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment +my %chromosomes; # stores the chromosome sequences of the mouse genome +my %SQ_order; # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes +my %counting; # counting various events +my $final_output_filename; # required for the nucleotide coverage report +my @pids; # storing the process IDs of child processes in parallel mode + + +my $seqID_contains_tabs; +my $verbose = 0; + +if ($multicore > 1){ + warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n"; +} + + +sub multi_process_handling{ + + my $offset = 1; + my $process_id; + if ($multicore > 1){ + + until ($offset == $multicore){ + # warn "multicore: $multicore\noffset: $offset\n"; + my $fork = fork; + + if (defined $fork){ + if ($fork != 0){ + $process_id = $fork; + push @pids, $process_id; + if ($offset < $multicore){ + ++$offset; + # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; + } + else{ + # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; + } + } + elsif ($fork == 0){ + # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; + $process_id = $fork; + last; + } + } + else{ + die "Forking unsuccessful. Proceeding using a single thread only\n"; + } + } + + # warn "\nThe Thread Identity\n===================\n"; + if ($process_id){ + # print "I am the parent process. My children are called:\n"; + # print join ("\t",@pids),"\n"; + # print "I am going to process the following line count: $offset\n\n"; + } + elsif($process_id == 0){ + # warn "I am a child process: Process ID: $process_id\n"; + # warn "I am going to process the following line count: $offset\n\n"; + } + else{ + die "Process ID was: '$process_id'\n"; + } + } + else{ + warn "Single-core mode: setting pid to 1\n"; + $process_id = 1; + } + + return ($process_id,$offset); +} + + +sub subset_input_file_FastQ{ + + my ($filename,$process_id,$offset) = @_; + + if ($filename =~ /gz$/){ + open (OFFSET,"gunzip -c $filename |") or die "Couldn't read from file '$filename': $!\n"; + } + else{ + open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n"; + } + + # warn "offset is $offset\n"; + my $temp = $filename; + $temp .= ".temp.$offset"; + $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information + + if ($gzip){ + $temp .= '.gz'; + open (TEMPFQ,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n"; + } + else{ + open (TEMPFQ,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n"; + } + + my $line_count = 0; + + while (1){ + my $l1 = ; + my $l2 = ; + my $l3 = ; + my $l4 = ; + + last unless ($l4); + ++$line_count; + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + print TEMPFQ "$l1$l2$l3$l4"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + } + + close OFFSET or warn $!; + close TEMPFQ or warn "Failed to close file handle TEMPFQ: $!\n"; + + warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n"; + + return ($temp); # returning the subset filename + +} + +sub subset_input_file_FastA{ + + my ($filename,$process_id,$offset) = @_; + + if ($filename =~ /gz$/){ + open (OFFSET,"gunzip -c $filename |") or die "Couldn't read from file '$filename': $!\n"; + } + else{ + open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n"; + } + + # warn "offset is $offset\n"; + my $temp = $filename; + $temp .= ".temp.$offset"; + $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information + + if ($gzip){ + $temp .= '.gz'; + open (TEMPFA,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n"; + } + else{ + open (TEMPFA,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n"; + } + + warn "Writing temporary infile to $temp\n"; + + my $line_count = 0; + + while (1){ + my $l1 = ; + my $l2 = ; + + last unless ($l2); + ++$line_count; + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + print TEMPFA "$l1$l2"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + } + + close OFFSET or warn $!; + close TEMPFA or warn "Failed to close file handle TEMPFQ: $!\n"; + + warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n"; + + return ($temp); # returning the subset filename + +} + +##### +##### + +foreach my $filename (@filenames){ + + my $original_filename = $filename; + my $original_filename_1; + my $original_filename_2; + + chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n"; + ### resetting the counting hash and fhs + reset_counters_and_fhs($filename); + @pids = (); + $seqID_contains_tabs = 0; + + ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + warn "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + ### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by + ### splitting the input file(s) into several smaller subfiles and merging the results back at + ### the end. + + # get general settings (also for single-threaded use) + my ($pid,$offset) = multi_process_handling (); + + my ($single_end,$paired_end); + ### PAIRED-END ALIGNMENTS + if ($filename =~ ','){ + + $single_end = 0; + $paired_end = 1; + + my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file + + $fhs[0]->{name} = 'CTread1GAread2CTgenome'; + $fhs[1]->{name} = 'GAread1CTread2GAgenome'; + $fhs[2]->{name} = 'GAread1CTread2CTgenome'; + $fhs[3]->{name} = 'CTread1GAread2GAgenome'; + warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; + + my ($filename_1,$filename_2) = (split (/,/,$filename)); + $original_filename_1 = $filename_1; + $original_filename_2 = $filename_2; + + warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; + + ### subsetting the input file(s) + unless ($multicore == 1){ # not needed in single-core mode + # warn "My PID: $pid\nMy offset: $offset\n"; + if ($sequence_file_format eq 'FASTA'){ + my $temp_filename_1 = subset_input_file_FastA($filename_1,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; + $filename_1 = "${temp_dir}$temp_filename_1"; + + my $temp_filename_2 = subset_input_file_FastA($filename_2,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; + $filename_2 = "${temp_dir}$temp_filename_2"; + } + else{ # FastQ format, default + my $temp_filename_1 = subset_input_file_FastQ($filename_1,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; + $filename_1 = "${temp_dir}$temp_filename_1"; + + my $temp_filename_2 = subset_input_file_FastQ($filename_2,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; + $filename_2 = "${temp_dir}$temp_filename_2"; + } + } + + ### additional variables only for paired-end alignments + my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Input files are in FastA format\n"; + + if ($directional){ + ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + + ### FastQ format + else{ + warn "Input files are in FastQ format\n"; + if ($directional){ + if ($bowtie2){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[0]->{inputfile_2} = undef; # no longer needed + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[3]->{inputfile_2} = undef; # no longer needed + } + else{ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + elsif($pbat){ # PBAT-Seq. This works for both Bowtie and Bowtie 2 + ### At the moment we are only performing alignments only with uncompressed FastQ files + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = undef; + $fhs[0]->{inputfile_2} = undef; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = undef; + $fhs[3]->{inputfile_2} = undef; + } + else{ + if ($bowtie2){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files + } + else{ # uncompressed temp files + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } + + ### Else we are performing SINGLE-END ALIGNMENTS + else{ + warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; + + $single_end = 1; + $paired_end = 0; + + ### subsetting the input file(s) + unless ($multicore == 1){ # not needed in single-core mode + # warn "My PID: $pid\nMy offset: $offset\n"; + if ($sequence_file_format eq 'FASTA'){ + my $temp_filename = subset_input_file_FastA($filename,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; + $filename = "${temp_dir}$temp_filename"; + } + else{ # FastQ format, default + my $temp_filename = subset_input_file_FastQ($filename,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; + $filename = "${temp_dir}$temp_filename"; + } + } + + ### Initialising bisulfite conversion filenames + my ($C_to_T_infile,$G_to_A_infile); + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Input file is in FastA format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); + } + } + + ## FastQ format + else{ + warn "Input file is in FastQ format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + elsif($pbat){ + ($G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating up to 4 different bowtie filehandles and storing the first entry + if ($pbat){ + if ($bowtie2){ # as of version 0.10.2 we also support PBAT alignments for Bowtie 2 + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (undef,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); + } + } + elsif ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); + } + } + + start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid); + + } + + ### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS + + if ($pid){ # only performing this for the parent process + + if ($multicore > 1){ + + warn "Now waiting for all child processes to complete\n"; + + ### we need to ensure that we wait for all child processes to be finished before continuing + # warn "here are the child IDs: @pids\n"; + # warn "Looping through the child process IDs:\n"; + + foreach my $id (@pids){ + # print "$id\t"; + my $kid = waitpid ($id,0); + # print "Returned: $kid\nExit status: $?\n"; + unless ($? == 0){ + warn "\nChild process terminated with exit signal: '$?'\n\n"; + } + } + + # regenerating names for temporary files + my @temp_input; + my @temp_output; + my @temp_reports; + my @temp_unmapped_1; # will store single end reads or R1 of paired-end + my @temp_unmapped_2; + my @temp_ambiguous_1; # will store single end reads or R1 of paired-end + my @temp_ambiguous_2; + my @temp_ambig_bam; + + for (1..$offset){ + + # Temp Input Files + if ($single_end){ + if ($gzip){ + push @temp_input, "${original_filename}.temp.${_}.gz"; + } + else{ + push @temp_input, "${original_filename}.temp.${_}"; + } + + } + elsif($paired_end){ + if ($gzip){ + push @temp_input, "${original_filename_1}.temp.${_}.gz"; + push @temp_input, "${original_filename_2}.temp.${_}.gz"; + } + else{ + push @temp_input, "${original_filename_1}.temp.${_}"; + push @temp_input, "${original_filename_2}.temp.${_}"; + } + } + + # if files had a prefix we need to specify it + my $add_prefix; + if (defined $prefix){ + $add_prefix = "${prefix}."; + } + else{ + $add_prefix = ''; + } + + # Temp Output Files + if ($single_end){ + + if ($bowtie2){ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2_SE_report.txt"; + push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2.ambig.bam"; # only for Bowtie 2 + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2_SE_report.txt"; + push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2.ambig.bam"; # only for Bowtie 2 + } + } + else{ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_SE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_SE_report.txt"; + } + } + + if ($unmapped){ + if ($gzip){ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_unmapped_reads.fq"; + } + else{ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_unmapped_reads.fq"; + } + } + + if ($ambiguous){ + if ($gzip){ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_ambiguous_reads.fq"; + } + else{ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_ambiguous_reads.fq"; + } + } + + } + elsif($paired_end){ + if ($bowtie2){ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_PE_report.txt"; + push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2 + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_PE_report.txt"; + push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2 + } + } + else{ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_PE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_PE_report.txt"; + } + } + + if ($unmapped){ + if ($gzip){ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_unmapped_reads_1.fq"; + push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_unmapped_reads_2.fq"; + } + else{ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_unmapped_reads_1.fq"; + push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_unmapped_reads_2.fq"; + } + } + + if ($ambiguous){ + if ($gzip){ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_ambiguous_reads_1.fq"; + push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_ambiguous_reads_2.fq"; + } + else{ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_ambiguous_reads_1.fq"; + push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_ambiguous_reads_2.fq"; + } + } + + } + } + + warn "\n\nRight, cleaning up now...\n\n"; + + # deleting temp files; + warn "Deleting temporary sequence files...\n"; + foreach my $temp (@temp_input){ + #print "$temp\t"; + $temp =~ s/.*\///; # deleting path information + print "${temp_dir}${temp}\t"; + unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n"; + } + print "\n\n"; + + # merging temp BAM files + if ($single_end){ + merge_individual_BAM_files(\@temp_output,$original_filename,$single_end); + } + else{ + merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end); + } + + # deleting temp BAM files + warn "Deleting temporary BAM files...\n"; + foreach my $temp (@temp_output){ + # print "$temp\t"; + $temp =~ s/.*\///; # deleting path information + print "${output_dir}${temp}\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n"; + } + print "\n\n"; + + ### AMBIGUOUS BAM files + if ($ambig_bam){ + + # merging temp AMBIG BAM files + if ($single_end){ + merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename,$single_end); + } + else{ + merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename_1,$single_end); + } + + # deleting temp BAM files + warn "Deleting temporary ambiguous BAM files...\n"; + foreach my $temp (@temp_ambig_bam){ + # print "$temp\t"; + $temp =~ s/.*\///; # deleting path information + print "${output_dir}${temp}\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous BAM file ${output_dir}${temp}: $!\n"; + } + print "\n\n"; + } + + if ($unmapped){ + if ($single_end){ + merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end); + } + else{ + merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1'); + merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2'); + } + + # deleting temp unmapped files + warn "Deleting temporary unmapped files...\n"; + foreach my $temp (@temp_unmapped_1){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; + } + if ($paired_end){ + foreach my $temp (@temp_unmapped_2){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; + } + } + print "\n\n"; + + } + + if ($ambiguous){ + if ($single_end){ + merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end); + } + else{ + merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1'); + merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2'); + } + + # deleting temp ambiguous files + warn "Deleting temporary ambiguous files...\n"; + foreach my $temp (@temp_ambiguous_1){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; + } + + if ($paired_end){ + foreach my $temp (@temp_ambiguous_2){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; + } + } + print "\n\n"; + } + + # resetting the counters once more so we can add all data from all temporary reports + reset_counters_and_fhs($original_filename); + + ### Merging the Bismark mapping report files + if ($single_end){ + merge_individual_mapping_reports(\@temp_reports,$original_filename,$single_end); + print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis'); + } + else{ + merge_individual_mapping_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2); + print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis'); + } + + # deleting temp report files + warn "Deleting temporary report files...\n"; + foreach my $temp (@temp_reports){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n"; + } + print "\n\n"; + + } + + } + + if ($pid){ # only for the Parent + warn "\n====================\nBismark run complete\n====================\n\n"; + + if ($nucleotide_coverage){ + warn "Now calculating observed and expected nucleotide coverage statistics... \n\n"; + if ($final_output_filename =~ /(bam|cram)|/){ + my @args; + push @args, "--genome $genome_folder"; + push @args, "--dir '$output_dir'"; + push @args, "--samtools_path $samtools_path"; + push @args, $final_output_filename; + print "@args","\n"; sleep(3); + + system ("$Bin/bam2nuc @args"); + warn "Finished bam2nuc calculation ...\n\n"; + + } + else{ + warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n"; + } + } + + } + +} + +sub merge_individual_mapping_reports{ + + my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_; + my $report_file = $original_filename_1; + $report_file =~ s/.*\///; # removing path information + $report_file =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + + if ($prefix){ + $report_file = "${prefix}.${report_file}"; + } + + if ($basename){ # Output file basename is set using the -B argument + $report_file = ${basename}; + } + + if ($single_end){ + if ($bowtie2){ + $report_file .= '_bismark_bt2_SE_report.txt'; + } + else{ + $report_file .= '_bismark_SE_report.txt'; + } + } + else{ + if ($bowtie2){ + $report_file .= '_bismark_bt2_PE_report.txt'; + } + else{ + $report_file .= '_bismark_PE_report.txt'; + } + } + warn "Writing report to ${output_dir}${report_file}\n"; + open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n"; + + foreach my $temp(@$temp_reports){ + $temp =~ s/.*\///; # removing path information + } + + warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n"; + + if ($single_end){ + print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n"; + } + else{ # paired-end + print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n"; + } + + + my $first = 0; + + foreach my $temp(@$temp_reports){ + # $temp =~ s/.*\///; # removing path information + + warn "Merging from file >> $temp <<\n"; + open (IN,"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}'\n"; + + ### this is printing the first couple of lines + while (){ + chomp; + if ($_ =~ /^Bismark report/){ + next; + } + + unless ($first){ # only happens for the first run we are processing + if ($_ =~ /^Final Alignment/){ + ++$first; + last; + } + else{ + print REPORT "$_\n"; + } + } + } + close IN or warn "Failed to close filehandle\n"; + + ### Simon says: You are going to regret this in the future. Just for the record. He might be right... + read_alignment_report($temp,$single_end); + + } + warn "\n"; + +} + +sub read_alignment_report{ + my ($report,$single_end) = @_; + + my $unique; + my $no_aln; + my $multiple; + my $no_genomic; + my $total_seqs; + my $bismark_version; + my $input_filename; + + my $unique_text; + my $no_aln_text; + my $multiple_text; + my $total_seq_text; + + my $total_C_count; + my ($meth_CpG,$meth_CHG,$meth_CHH,$meth_unknown); + my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH,$unmeth_unknown); + + my $number_OT; + my $number_CTOT; + my $number_CTOB; + my $number_OB; + + open (ALN,"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report'\n"; + + while (){ + chomp; + + ### General Alignment stats + if ($_ =~ /^Sequence pairs analysed in total:/ ){ ## Paired-end + (undef,$total_seqs) = split /\t/; + # warn "Total paired seqs: >> $total_seqs <<\n"; + } + elsif ($_ =~ /^Sequences analysed in total:/ ){ ## Single-end + (undef,$total_seqs) = split /\t/; + # warn "total single-end seqs >> $total_seqs <<\n"; + } + + elsif($_ =~ /^Number of paired-end alignments with a unique best hit:/){ ## Paired-end + (undef,$unique) = split /\t/; + # warn "Unique PE>> $unique <<\n"; + } + elsif($_ =~ /^Number of alignments with a unique best hit from/){ ## Single-end + (undef,$unique) = split /\t/; + # warn "Unique SE>> $unique <<\n"; + } + + elsif($_ =~ /^Sequence pairs with no alignments under any condition:/){ ## Paired-end + (undef,$no_aln) = split /\t/; + # warn "No alignment PE >> $no_aln <<\n"; + } + elsif($_ =~ /^Sequences with no alignments under any condition:/){ ## Single-end + (undef,$no_aln) = split /\t/; + # warn "No alignments SE>> $no_aln <<\n"; + } + + elsif($_ =~ /^Sequence pairs did not map uniquely:/){ ## Paired-end + (undef,$multiple) = split /\t/; + # warn "Multiple alignments PE >> $multiple <<\n"; + } + elsif($_ =~ /^Sequences did not map uniquely:/){ ## Single-end + (undef,$multiple) = split /\t/; + # warn "Multiple alignments SE >> $multiple <<\n"; + } + + elsif($_ =~ /^Sequence pairs which were discarded because genomic sequence could not be extracted:/){ ## Paired-end + (undef,$no_genomic) = split /\t/; + # warn "No genomic sequence PE >> $no_genomic <<\n"; + } + elsif($_ =~ /^Sequences which were discarded because genomic sequence could not be extracted:/){ ## Single-end + (undef,$no_genomic) = split /\t/; + # warn "No genomic sequence SE>> $no_genomic <<\n"; + } + + ### Context Methylation + elsif($_ =~ /^Total number of C/ ){ + (undef,$total_C_count) = split /\t/; + # warn "Total number C >> $total_C_count <<\n"; + } + + elsif($_ =~ /^Total methylated C\'s in CpG context:/ ){ + (undef,$meth_CpG) = split /\t/; + # warn "meth CpG >> $meth_CpG <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in CHG context:/ ){ + (undef,$meth_CHG) = split /\t/; + # warn "meth CHG >> $meth_CHG <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in CHH context:/ ){ + (undef,$meth_CHH) = split /\t/; + # warn "meth CHH >> $meth_CHH <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in Unknown context:/ ){ + (undef,$meth_unknown) = split /\t/; + # warn "meth Unknown >> $meth_unknown <<\n" ; + } + + elsif($_ =~ /^Total unmethylated C\'s in CpG context:/ or $_ =~ /^Total C to T conversions in CpG context:/){ + (undef,$unmeth_CpG) = split /\t/; + # warn "unmeth CpG >> $unmeth_CpG <<\n" ; + } + elsif($_ =~ /^Total unmethylated C\'s in CHG context:/ or $_ =~ /^Total C to T conversions in CHG context:/){ + (undef,$unmeth_CHG) = split /\t/; + # warn "unmeth CHG >> $unmeth_CHG <<\n" ; + } + elsif($_ =~ /^Total unmethylated C\'s in CHH context:/ or $_ =~ /^Total C to T conversions in CHH context:/){ + (undef,$unmeth_CHH) = split /\t/; + # warn "unmeth CHH >> $unmeth_CHH <<\n"; + } + elsif($_ =~ /^Total unmethylated C\'s in Unknown context:/ or $_ =~ /^Total C to T conversions in Unknown context:/){ + (undef,$unmeth_unknown) = split /\t/; + # warn "unmeth Unknown >> $unmeth_unknown <<\n" ; + } + + ### Strand Origin + + elsif($_ =~ /^CT\/GA\/CT:/ ){ ## Paired-end + (undef,$number_OT) = split /\t/; + # warn "Number OT PE>> $number_OT <<\n" ; + } + elsif($_ =~ /^CT\/CT:/ ){ ## Single-end + (undef,$number_OT) = split /\t/; + # warn "Number OT SE>> $number_OT <<\n" ; + } + + elsif($_ =~ /^GA\/CT\/CT:/ ){ ## Paired-end + (undef,$number_CTOT) = split /\t/; + # warn "Number CTOT PE >> $number_CTOT <<\n" ; + } + elsif($_ =~ /^GA\/CT:/ ){ ## Single-end + (undef,$number_CTOT) = split /\t/; + # warn "Number CTOT SE >> $number_CTOT <<\n" ; + } + + elsif($_ =~ /^GA\/CT\/GA:/ ){ ## Paired-end + (undef,$number_CTOB) = split /\t/; + # warn "Number CTOB PE >> $number_CTOB <<\n" ; + } + elsif($_ =~ /^GA\/GA:/ ){ ## Single-end + (undef,$number_CTOB) = split /\t/; + # warn "Number CTOB SE >> $number_CTOB <<\n"; + } + + elsif($_ =~ /^CT\/GA\/GA:/ ){ ## Paired-end + (undef,$number_OB) = split /\t/; + # warn "Number OB PE >> $number_OB <<\n"; + } + elsif($_ =~ /^CT\/GA:/ ){ ## Single-end + (undef,$number_OB) = split /\t/; + # warn "Number OB SE >> $number_OB <<\n"; + } + } + + $counting{sequences_count} += $total_seqs; + $counting{unique_best_alignment_count} += $unique; + $counting{no_single_alignment_found} += $no_aln; + $counting{unsuitable_sequence_count} += $multiple; + $counting{genomic_sequence_could_not_be_extracted_count} += $no_genomic; + + $counting{total_meCHH_count} += $meth_CHH; + $counting{total_meCHG_count} += $meth_CHG; + $counting{total_meCpG_count} += $meth_CpG; + if ($bowtie2){ + $counting{total_meC_unknown_count} += $meth_unknown; + } + + $counting{total_unmethylated_CHH_count} += $unmeth_CHH; + $counting{total_unmethylated_CHG_count} += $unmeth_CHG; + $counting{total_unmethylated_CpG_count} += $unmeth_CpG; + if ($bowtie2){ + $counting{total_unmethylated_C_unknown_count} += $unmeth_unknown; + } + + if ($single_end){ + $counting{CT_CT_count} += $number_OT; + $counting{CT_GA_count} += $number_OB; + $counting{GA_CT_count} += $number_CTOT; + $counting{GA_GA_count} += $number_CTOB; + } + else{ + # paired-end + $counting{GA_CT_CT_count} += $number_CTOT; + $counting{CT_GA_CT_count} += $number_OT; + $counting{GA_CT_GA_count} += $number_CTOB; + $counting{CT_GA_GA_count} += $number_OB; + } +} + +sub merge_individual_ambiguous_files{ + + my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_; + my $ambiguous_file = $original_filename; + $ambiguous_file =~ s/.*\///; # removing path information + + if ($prefix){ + $ambiguous_file = "${prefix}.${ambiguous_file}"; + } + + if ($single_end){ + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads.fq.gz"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads.fq.gz/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads.fa.gz/; + } + } + } + else{ # paired-end + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fq.gz"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fq.gz/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fa.gz/; + } + } + } + + foreach my $temp(@$temp_ambiguous){ + $temp =~ s/.*\///; # removing path information + } + + open (AMBIGUOUS,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n"; + + foreach my $temp(@$temp_ambiguous){ + warn "Merging from file >> $temp <<\n"; + if ($temp =~ /gz$/){ + open (IN,"gunzip -c ${output_dir}$temp |") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n"; + } + else{ + open (IN,"${output_dir}$temp") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n"; + } + + while (){ + print AMBIGUOUS; + } + close IN or warn "Failed to close filehandle\n"; + } + warn "\n"; + + close AMBIGUOUS or warn "Failed to close output filehandle AMBIGUOUS\n\n"; +} + + +sub merge_individual_unmapped_files{ + + my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_; + my $unmapped_file = $original_filename; + $unmapped_file =~ s/.*\///; # removing path information + + if ($prefix){ + $unmapped_file = "${prefix}.${unmapped_file}"; + } + + if ($single_end){ + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads.fq.gz"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads.fq.gz/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads.fa.gz/; + } + } + } + else{ # paired-end + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads${paired_information}.fq.gz"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads${paired_information}.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads${paired_information}.fq.gz/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads${paired_information}.fa.gz/; + } + } + } + + foreach my $temp(@$temp_unmapped){ + $temp =~ s/.*\///; # removing path information + } + + open (UNMAPPED,"| gzip -c - > ${output_dir}${unmapped_file}") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n"; + warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n"; + + foreach my $temp(@$temp_unmapped){ + warn "Merging from file >> $temp <<\n"; + if ($temp =~ /gz$/){ + open (IN,"gunzip -c ${output_dir}${temp} |") or die "Failed to read from unmapped temp file '${output_dir}$temp'\n"; + } + else{ + open (IN,"${output_dir}${temp}") or die "Failed to read from unmapped temp file '${output_dir}${temp}'\n"; + } + + while (){ + print UNMAPPED; + } + close IN or warn "Failed to close filehandle\n"; + } + warn "\n"; + + close UNMAPPED or warn "Failed to close output filehandle UNMAPPED\n\n"; +} + + +sub merge_individual_BAM_files{ + + my ($tempbam,$original_filename,$single_end) = @_; + my $merged_name = $original_filename; + + #warn "merged name is: $merged_name\n"; + $merged_name =~ s/.*\///; # deleting path information + # warn "merged name is: $merged_name\n"; + $merged_name =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + # warn "merged name is: $merged_name\n"; sleep(5); + + foreach my $temp_bam(@$tempbam){ + $temp_bam =~ s/.*\///; # deleting path information + } + + if ($prefix){ + $merged_name = "$prefix.$merged_name"; + } + + if ($single_end){ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2.bam'; + } + else{ # BAM is the default output + $merged_name .= '_bismark.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}.bam"; + } + } + else{ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2_pe.bam'; + } + else{ # BAM is the default output + $merged_name .= '_bismark_pe.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}_pe.bam"; + } + } + + + if ($cram){ + $merged_name =~ s/bam$/cram/; + warn "At this stage we write out a single CRAM file and delete all temporary BAM files\n"; + warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n"; + $final_output_filename = "${output_dir}${merged_name}"; + + open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to CRAM file $merged_name: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n"; + } + else{ + $final_output_filename = "${output_dir}${merged_name}"; + warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n"; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n"; + } + + my $first = 0; + + foreach my $temp_bam(@$tempbam){ + # $temp_bam =~ s/.*\///; # deleting path information + + warn "Merging from file >> $temp_bam <<\n"; + + if ($first > 0){ + open (IN,"$samtools_path view ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + else{ # only for the first file we print the header as well + open (IN,"$samtools_path view -h ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + + while (){ + print OUT; + } + close IN or warn "Failed to close filehandle\n"; + ++$first; + } + warn "\n"; + + close OUT or warn "Failed to close output filehandle\n\n"; + +} + + +sub merge_individual_ambig_BAM_files{ + + my ($tempbam,$original_filename,$single_end) = @_; + my $merged_name = $original_filename; + + # warn "merged name is: $merged_name\n"; + $merged_name =~ s/.*\///; # deleting path information + # warn "merged name is: $merged_name\n"; sleep(1); + + foreach my $temp_bam(@$tempbam){ + $temp_bam =~ s/.*\///; # deleting path information + } + + if ($prefix){ + $merged_name = "$prefix.$merged_name"; + } + + if ($single_end){ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2.ambig.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}.ambig.bam"; + } + } + else{ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2_pe.ambig.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}_pe.ambig.bam"; + } + } + + warn "Now merging ambiguous BAM files @$tempbam into >>> $merged_name <<<\n"; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n"; + my $first = 0; + + foreach my $temp_bam(@$tempbam){ + # $temp_bam =~ s/.*\///; # deleting path information + + warn "Merging from file >> $temp_bam <<\n"; + + if ($first > 0){ + open (IN,"$samtools_path view ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + else{ # only for the first file we print the header as well + open (IN,"$samtools_path view -h ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + + while (){ + print OUT; + } + close IN or warn "Failed to close filehandle\n"; + ++$first; + } + warn "\n"; + + close OUT or warn "Failed to close output filehandle\n\n"; +} + +sub start_methylation_call_procedure_single_ends { + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + my ($dir,$filename); + + if ($sequence_file =~ /\//){ + ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $sequence_file; + } + + ### printing all alignments to a results file + my $outfile = $filename; + # warn "Outfile: $outfile\n"; + $outfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + # warn "Outfile: $outfile\n";sleep(5); + + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default for Bowtie 2 + $outfile =~ s/$/_bismark_bt2.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark.txt/; + } + else{ # SAM is the default output + $outfile =~ s/$/_bismark.sam/; + } + + if ($basename){ # Output file basename is set using the -B argument + $outfile = "${basename}.sam"; + } + + $bam = 0 unless (defined $bam); + + if ($ambig_bam){ + my $ambig_bam_out = $outfile; + $ambig_bam_out =~ s/sam$/ambig.bam/; + warn "Ambiguous BAM output: $ambig_bam_out\n"; + open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n"; + } + + if ($cram){ ### Samtools is installed, writing out CRAM directly. This qill require Samtools version 1.2 or higher! + ### for multicore processing we write out BAM files by default and merge them together as a single CRAM file in the merging step later on. + ### This avoids having to change all the the file endings on the way + if($multicore > 1){ + $outfile =~ s/sam$/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # single-core mode + $outfile =~ s/sam$/cram/; + $final_output_filename = "${output_dir}${outfile}"; + open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n"; + } + } + elsif($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam$/bam/; + $final_output_filename = "${output_dir}${outfile}"; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n"; + + + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename; + $reportfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_SE_report.txt/; + } + + if ($basename){ # Output file basename is set using the -B argument + $reportfile = "${basename}_SE_report.txt"; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; + + if ($unmapped){ + my $unmapped_file = $filename; + if ($prefix){ + $unmapped_file = "$prefix.$unmapped_file"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads.fq"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads.fq/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads.fa/; + } + } + + if ($multicore > 1){ # multicore runs already output gzipped unmapped files + open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; + } + else{ + $unmapped_file .= '.gz'; + open (UNMAPPED,"| gzip -c - > $output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; + } + warn "Unmapped sequences will be written to $output_dir$unmapped_file\n"; + } + + if ($ambiguous){ + my $ambiguous_file = $filename; + + if ($prefix){ + $ambiguous_file = "$prefix.$ambiguous_file"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads.fq"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads.fq/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads.fa/; + } + } + + if ($multicore > 1){ # multicore runs already output gzipped amobiguous files + open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + } + else{ + $ambiguous_file .= '.gz'; + open (AMBIG,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + } + warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n"; + } + elsif ($pbat){ + print REPORT "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n"; + } + else{ + print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n"; + } + + if ($bowtie2){ + print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input file is in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid); + } + ### Input file is in FastQ format + else{ + process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid); + } +} + +sub start_methylation_call_procedure_paired_ends { + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + my ($dir_1,$filename_1); + + if ($sequence_file_1 =~ /\//){ + ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_1 = $sequence_file_1; + } + + my ($dir_2,$filename_2); + + if ($sequence_file_2 =~ /\//){ + ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_2 = $sequence_file_2; + } + + ### printing all alignments to a results file + my $outfile = $filename_1; + # warn "Outfile: $outfile\n"; + $outfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + # warn "Outfile: $outfile\n";sleep(5); + + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default Bowtie 2 output + $outfile =~ s/$/_bismark_bt2_pe.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark_pe.txt/; + } + else{ # SAM format is the default Bowtie 1 output + $outfile =~ s/$/_bismark_pe.sam/; + } + + + if ($basename){ # Output file basename is set using the -B argument + $outfile = "${basename}_pe.sam"; + } + + if ($ambig_bam){ + my $ambig_bam_out = $outfile; + $ambig_bam_out =~ s/sam$/ambig.bam/; + warn "Ambiguous BAM output: $ambig_bam_out\n"; + open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n"; + } + + $bam = 0 unless (defined $bam); + + if ($cram){ ### Samtools is installed, writing out CRAM directly. This qill require Samtools version 1.2 or higher! + if ($multicore > 1){ + $outfile =~ s/sam$/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # single-core mode + $outfile =~ s/sam$/cram/; + $final_output_filename = "${output_dir}${outfile}"; + open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n"; + } + } + elsif ($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam$/bam/; + $final_output_filename = "${output_dir}${outfile}"; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + + warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n"; + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename_1; + $reportfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter + + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_PE_report.txt/; + } + + if ($basename){ # Output file basename is set using the -B argument + $reportfile = "${basename}_PE_report.txt"; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; + + if ($bowtie2){ + print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n"; + } + else{ + print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n"; + } + + + ### Unmapped read output + if ($unmapped){ + my $unmapped_1 = $filename_1; + my $unmapped_2 = $filename_2; + + if ($prefix){ + $unmapped_1 = "$prefix.$unmapped_1"; + $unmapped_2 = "$prefix.$unmapped_2"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_1 = "${basename}_unmapped_reads_1.fq"; + $unmapped_2 = "${basename}_unmapped_reads_2.fq"; + } + else{ + $unmapped_1 = "${basename}_unmapped_reads_1.fa"; + $unmapped_2 = "${basename}_unmapped_reads_2.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_1 =~ s/$/_unmapped_reads_1.fq/; + $unmapped_2 =~ s/$/_unmapped_reads_2.fq/; + } + else{ + $unmapped_1 =~ s/$/_unmapped_reads_1.fa/; + $unmapped_2 =~ s/$/_unmapped_reads_2.fa/; + } + } + + if ($multicore > 1){ # unmapped files are merged into .gz files in multicore runs anyway + open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; + open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; + } + else{ + $unmapped_1 .= '.gz'; + $unmapped_2 .= '.gz'; + open (UNMAPPED_1,"| gzip -c - > $output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; + open (UNMAPPED_2,"| gzip -c - > $output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; + } + warn "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; + } + + if ($ambiguous){ + my $amb_1 = $filename_1; + my $amb_2 = $filename_2; + + if ($prefix){ + $amb_1 = "$prefix.$amb_1"; + $amb_2 = "$prefix.$amb_2"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $amb_1 = "${basename}_ambiguous_reads_1.fq"; + $amb_2 = "${basename}_ambiguous_reads_2.fq"; + } + else{ + $amb_1 = "${basename}_ambiguous_reads_1.fa"; + $amb_2 = "${basename}_ambiguous_reads_2.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $amb_1 =~ s/$/_ambiguous_reads_1.fq/; + $amb_2 =~ s/$/_ambiguous_reads_2.fq/; + } + else{ + $amb_1 =~ s/$/_ambiguous_reads_1.fa/; + $amb_2 =~ s/$/_ambiguous_reads_2.fa/; + } + } + + if ($multicore > 1){ # ambiguous files are merged into .gz files in multicore runs anyway + open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; + open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; + } + else{ + $amb_1 .= '.gz'; + $amb_2 .= '.gz'; + open (AMBIG_1,"| gzip -c - > $output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; + open (AMBIG_2,"| gzip -c - > $output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; + } + warn "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n"; + } + elsif ($pbat){ + print REPORT "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n"; + } + else{ + print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n"; + } + + + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input files are in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } + ### Input files are in FastQ format + else{ + process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } +} + +sub print_final_analysis_report_single_end{ + my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_; + + if ($merge_multi){ + warn "Printing a final merged alignment report for all individual sub-reports\n\n"; + } + else{ + ### All sequences from the original sequence file have been analysed now + ### deleting temporary C->T or G->A infiles + + if ($directional){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; + } + else{ + warn "Could not delete temporary file $C_to_T_infile properly $!\n"; + } + } + elsif ($pbat){ + my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary file $G_to_A_infile properly $!\n"; + } + } + else{ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary files properly $!\n"; + } + } + } + + ### printing a final report for the alignment procedure + print REPORT "Final Alignment report\n",'='x22,"\n"; + warn "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequences analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; + my $percent_alignable_sequences; + + if ($counting{sequences_count} == 0){ + $percent_alignable_sequences = 0; + } + else{ + $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + + warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; + print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; + + ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), + ### only calculating the percentage if there were any overruled alignments + if ($counting{low_complexity_alignments_overruled_count}){ + my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); + # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; + } + + print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + ### detailed information about Cs analysed + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage (CHG context) if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage (CHH context) if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + + if ($seqID_contains_tabs){ + warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + } +} + + +sub print_final_analysis_report_paired_ends{ + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_; + + if ($merge_multi){ + warn "Printing a final merged alignment report for all individual sub-reports\n\n"; + } + else{ + ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles + if ($directional){ + if ($G_to_A_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n"; + } + else{ + warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n"; + } + } + } + else{ + if ($G_to_A_infile_2 and $C_to_T_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 4){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + } + } + + ### printing a final report for the alignment procedure + warn "Final Alignment report\n",'='x22,"\n"; + print REPORT "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + + my $percent_alignable_sequence_pairs; + if ($counting{sequences_count} == 0){ + $percent_alignable_sequence_pairs = 0; + } + else{ + $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; + print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; + + print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + + + print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + ### detailed information about Cs analysed + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage in CHG context if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage in CHH context if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + +} + +sub process_single_end_fastA_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"gunzip -c $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + # last if ($counting{sequences_count} > 100); + my $identifier = ; + my $sequence = ; + last unless ($identifier and $sequence); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + + $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG ">$identifier\n"; + print AMBIG "$sequence\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED ">$identifier\n"; + print UNMAPPED "$sequence\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or warn "Failed to close filehandle OUT: $!\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid); + +} + +sub process_single_end_fastQ_file_for_methylation_call{ + + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + + ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"gunzip -c $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + my $identifier = ; + my $sequence = ; + my $identifier_2 = ; + my $quality_value = ; + last unless ($identifier and $sequence and $identifier_2 and $quality_value); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + chomp $quality_value; + + $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG "\@$identifier\n"; + print AMBIG "$sequence\n"; + print AMBIG $identifier_2; + print AMBIG "$quality_value\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED "\@$identifier\n"; + print UNMAPPED "$sequence\n"; + print UNMAPPED $identifier_2; + print UNMAPPED "$quality_value\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or warn "Failed to close filehandle OUT: $!\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid); + if ($ambig_bam){ + close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n"; + } +} + +sub process_fastA_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the + ### converted genomes (either the C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"gunzip -c $sequence_file_1 |") or die "Failed to open gunzip -c pipe to $sequence_file_1 $!\n"; + open (IN2,"gunzip -c $sequence_file_2 |") or die "Failed to open gunzip -c pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + + my $count = 0; + + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + + $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or die $!; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + +} + +sub process_fastQ_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) + ### of the converted genomes (either C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"gunzip -c $sequence_file_1 |") or die "Failed to open gunzip -c pipe to $sequence_file_1 $!\n"; + open (IN2,"gunzip -c $sequence_file_2 |") or die "Failed to open gunzip -c pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + my $ident_1 = ; # not needed + my $quality_value_1 = ; # not needed + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + my $ident_2 = ; # not needed + my $quality_value_2 = ; # not needed + last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + chomp $quality_value_1; + chomp $quality_value_2; + + $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + # seq_1 + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_1 $ident_1; + print AMBIG_1 "$quality_value_1\n"; + # seq_2 + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + print AMBIG_2 $ident_2; + print AMBIG_2 "$quality_value_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + # seq_1 + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_1 $ident_1; + print UNMAPPED_1 "$quality_value_1\n"; + # seq_2 + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + print UNMAPPED_2 $ident_2; + print UNMAPPED_2 "$quality_value_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or warn "Failed to close filehandle OUT: $!\n\n"; + if ($ambig_bam){ + close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n\n"; + } + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + +} + +sub check_bowtie_results_single_end{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome + foreach my $index (0..$#fhs){ + + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + ############################################################### + ### STEP I Now processing the alignment stored in last_line ### + ############################################################### + my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation + ### we only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found_1 == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + + unless($mismatch_info){ + $mismatch_info = ''; + } + + chomp $mismatch_info; + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + my $alignment_location = join (":",$chromosome,$position); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + $number_of_mismatches = undef; + ################################################################################################################################################## + ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a + ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will + ### be returned as $valid_alignment_found and it will then be processed in the next round only. + ################################################################################################################################################## + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### we only continue to extract useful information about this second alignment if 1 was returned + if ($valid_alignment_found_2 == 1){ + ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + unless($mismatch_info){ + $mismatch_info = ''; + } + chomp $mismatch_info; + + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + ### extracting the chromosome number from the bowtie output (see above) + my $alignment_location = join (":",$chromosome,$position); + ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position + ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this + ### case we are not writing the same entry out a second time. + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + #################################################################################################################################### + #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### + #################################################################################################################################### + $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + next; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + ### still within the 2nd sequence in correct orientation found + } + ### still withing the 1st sequence in correct orientation found + } + ### still within the if (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + if ($unmapped){ + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + else{ + return; + } + } + ####################################################################################################################################################### + ####################################################################################################################################################### + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### + ####################################################################################################################################################### + ####################################################################################################################################################### + ### Going to use the variable $sequence_fails as a memory if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### sorting in ascending order + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + + ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + } + } + elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ + ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and + ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a + ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome + ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite + ### reaction. E.g. + ### CAGTCACGCGCGCGCG will become + ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. + ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave + ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: + ### G->A conversion: + ### highly methylated: CAATCACACACACACA + ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce + ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the + ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted + ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. + ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions + ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of + ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. + ### In the above example the number of transliterations required to transform the actual sequence + ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) + ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) + ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments + ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed + ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be + ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. + my @three_candidate_seqs; + foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ + my $transliterations_performed; + if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); + } + elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); + } + else{ + die "unexpected index number range $!\n"; + } + push @three_candidate_seqs,{ + index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, + bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, + mismatch_number => $mismatch_number, + chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, + position => $mismatches{$mismatch_number}->{$composite_location}->{position}, + seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, + transliterations_performed => $transliterations_performed, + }; + } + ### sorting in ascending order for the lowest number of transliterations performed + @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; + my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; + my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; + my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; + # print "$first_array_element\t$second_array_element\t$third_array_element\n"; + if (($first_array_element*2) < $second_array_element){ + $counting{low_complexity_alignments_overruled_count}++; + ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits + $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; + $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; + } + else{ + $sequence_fails = 1; + } + } + else{ + $sequence_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it will have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + + print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out +} + +sub check_bowtie_results_single_end_bowtie2{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. + # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs + # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; + + my $alignment_ambiguous = 0; + my $first_ambig_alignment; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified + my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far + my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. + + my %alignments = (); + + ### reading from the Bowtie 2 output filehandles + foreach my $index (0..$#fhs){ + # print "Index: $index\n"; + # print "$fhs[$index]->{last_line}\n"; + # print "$fhs[$index]->{last_seq_id}\n"; + # sleep (1); + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; + + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; + + ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag == 4){ + ## reading in the next alignment, which must be the next sequence + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + } + + # if there are one or more proper alignments we can extract the chromosome number + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score,$second_best,$MD_tag); + my @fields = split (/\t/,$fhs[$index]->{last_line}); + + foreach (11..$#fields){ + if ($fields[$_] =~ /AS:i:(.*)/){ + $alignment_score = $1; + } + elsif ($fields[$_] =~ /XS:i:(.*)/){ + $second_best = $1; + } + elsif ($fields[$_] =~ /MD:Z:(.*)/){ + $MD_tag = $1; + } + } + + my $overwrite = 0; # If we get 2 alignments to the very same position, e.g. to OT with and AS of -156 and to CTOB with and AS of 0 we need the latter to trump the former, else + # the read will be assigned to the wrong strand which may result in incorrect methylation calls. + # this was brought to our attention by Sylvain Foret (ANU Canberra), 13 April 2016 + + if (!defined $best_AS_so_far){ + $best_AS_so_far = $alignment_score; + $overwrite++; + # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; + if ($ambig_bam){ # also setting the first_ambig_alignment + $first_ambig_alignment = $fhs[$index]->{last_line}; + $first_ambig_alignment =~ s/_(CT|GA)_converted//; + # warn "$first_ambig_alignment\n"; sleep(1); + } + } + else{ + if ($alignment_score >= $best_AS_so_far){ # AS are generally negative with a maximum of 0; + # 19 07 2016: changed this to >= so that equally good alignments are also added. Ambiguous alignments from different threads will be identified later on + $best_AS_so_far = $alignment_score; + $overwrite++; + # warn "Found better or equal alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n"; + + # 22 07 2016: resetting the ambiguous score within same thread only if the current alignment is really better than the previous one + if ($alignment_score > $best_AS_so_far){ + # warn "Resetting amb within thread value to 0\n"; + $amb_same_thread = 0; + + if ($ambig_bam){ # also setting a new first_ambig_alignment + $first_ambig_alignment = $fhs[$index]->{last_line}; + $first_ambig_alignment =~ s/_(CT|GA)_converted//; + # warn "$first_ambig_alignment\n"; sleep(1); + } + } + } + else{ + # warn "Current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; + } + } + + # warn "First best alignment_score is: '$alignment_score'\n"; + # warn "MD tag is: '$MD_tag'\n"; + die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag); + + if (defined $second_best){ + # warn "second best alignment_score is: '$second_best'\n\n"; + + # If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this + if ($alignment_score == $second_best){ + + # checking to see if this read produced the best alignment + if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous + # warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; + $amb_same_thread = 1; + } + else{ + # warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; + } + + ### if there is a better alignment later on -> fine. If not, the read will get booted altogether + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + if ($overwrite){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{alignment_score_second_best} = $second_best; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{CIGAR} = $cigar; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + } + + ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join (":",$chromosome,$position); + # warn "There is no second best hit. Overwrite status: $overwrite\n"; + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + if ($overwrite){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{alignment_score_second_best} = undef; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + $alignments{$alignment_location}->{CIGAR} = $cigar; + } + + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + } + } + } + } + + ### If there were several equally good alignments for the best alignment score we will boot the read + if ($amb_same_thread){ + $alignment_ambiguous = 1; + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + } + else{ + # warn "alignment won't be considered ambiguous. This time....\n"; + } + + ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$ambiguous_read_output\n"; + + if ($ambig_bam){ + # warn "Sequence is ambiguous, printing out BAM file:\n"; + print AMBIBAM "$first_ambig_alignment\n"; + } + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file + unless(%alignments){ + $counting{no_single_alignment_found}++; + # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$unmapped_read_output\n"; + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified + } + else{ + return 0; # default + } + } + + ####################################################################################################################################################### + + ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence altogether. + ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for + ### opening (5) and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + my $methylation_call_params; # hash reference which will store all information we need for the methylation call + my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + # if (scalar keys %alignments > 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; + # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "pos: $alignments{$alignment_location}->{position}\n"; + # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the hash with we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence altogether + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_alignment_score; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ + # print "$alignments{$alignment_location}->{alignment_score}\n"; + unless (defined $best_alignment_score){ + $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score: $best_alignment_score\n"; + } + else{ + ### if the second best alignment has the same alignment score as the first one, the sequence will get booted + if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ + # warn "Same alignment score, the sequence will get booted!\n"; + $sequence_fails = 1; + last; # exiting after the second alignment since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; + if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) { + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best}; + } + else { + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score}; + } + last; # exiting after processing the second alignment since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print OUT "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it has a unique best alignment + $counting{unique_best_alignment_count}++; + + ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well + extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + # Compute MAPQ value + $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef, + $methylation_call_params->{$identifier}->{alignment_score}, + $methylation_call_params->{$identifier}->{alignment_score_second_best}); + + + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out +} + + +sub determine_number_of_transliterations_performed{ + my ($sequence,$read_conversion) = @_; + my $number_of_transliterations; + if ($read_conversion eq 'CT'){ + $number_of_transliterations = $sequence =~ tr/C/T/; + } + elsif ($read_conversion eq 'GA'){ + $number_of_transliterations = $sequence =~ tr/G/A/; + } + else{ + die "Read conversion mode of the read was not specified $!\n"; + } + return $number_of_transliterations; +} + +sub decide_whether_single_end_alignment_is_valid{ + my ($index,$identifier) = @_; + + # extracting from Bowtie 1 format + my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; + + ### ensuring that the entry is the correct sequence + if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation we need to read in a new line + elsif($orientation == 0){ + my $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + ($id,$strand) = (split (/\t/,$newline))[0,1]; + + ### ensuring that the next entry is still the correct sequence + if ($id eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 1; ### 2nd possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs + elsif ($orientation == 0){ + $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation + } + else{ + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round + else{ + return 0; + } +} +######################### +### BOWTIE 1 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + # sleep (1); + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome + + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary + ### strands are not being reported by specifying --directional + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; + + ################################################################################## + ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### + ################################################################################## + my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong + ### orientation. We only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. + ### we store the useful information in %mismatches + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + + ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches_1; + my $number_of_mismatches_2; + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "Position 1 is higher than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ################################################################################################################################################### + ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### + ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### + ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### + ### this round ### + ################################################################################################################################################### + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + else{ + die "Either read 1 or read 2 needs to end on '/1'\n"; + } + + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### Now processing the entry we just stored in last_line_1 and last_line_2 + $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to + ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation + if ($valid_alignment_found == 1){ + ### we store the useful information in %mismatches + ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; + ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + $number_of_mismatches_1=''; + $number_of_mismatches_2=''; + ### Now extracting the number of mismatches to the converted genome + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "position 1 is greater than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ############################################################################################################################################### + ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### + ############################################################################################################################################### + $newline_1 = $fhs[$index]->{fh}-> getline(); + $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### within the 2nd sequence pair alignment in correct orientation found + } + ### within the 1st sequence pair alignment in correct orientation found + } + ### still within the (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_pair_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; + foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ + #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; + # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; + } + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{seq_id} = $identifier; + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; + $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; + $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; + $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; + } + } + else{ + $sequence_pair_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); + + ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + + print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +######################### +### BOWTIE 2 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends_bowtie2{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + + my %alignments; + my $alignment_ambiguous = 0; + + my $first_ambig_alignment_line1; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified R1 + my $first_ambig_alignment_line2; # R2 + + my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far + my $amb_same_thread = 0; ## if a read's primary and secondary alignments have the same alignment score we set this to true. + + ### reading from the Bowtie 2 output filehandles + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary + ### strands are not being reported when '--directional' is specified + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + + my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; + my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; + # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; + # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; + # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; + # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; + $id_1 =~ s/\/1$//; + $id_2 =~ s/\/2$//; + + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). + ### We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag_1 == 77 and $flag_2 == 141){ + ## reading in the next alignment, which must be the next sequence + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + # print "current sequence ($identifier) did not map, reading in next sequence\n"; + # print "$index\t$fhs[$index]->{last_seq_id}\n"; + # print "$index\t$fhs[$index]->{last_line_1}\n"; + # print "$index\t$fhs[$index]->{last_line_2}\n"; + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; + } + } + + ### If there are one or more proper alignments we can extract the chromosome number + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + + ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); + + my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); + my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); + + foreach (11..$#fields_1){ + if ($fields_1[$_] =~ /AS:i:(.*)/){ + $alignment_score_1 = $1; + } + elsif ($fields_1[$_] =~ /XS:i:(.*)/){ + $second_best_1 = $1; + } + elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ + $MD_tag_1 = $1; + } + } + + foreach (11..$#fields_2){ + if ($fields_2[$_] =~ /AS:i:(.*)/){ + $alignment_score_2 = $1; + } + elsif ($fields_2[$_] =~ /XS:i:(.*)/){ + $second_best_2 = $1; + } + elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ + $MD_tag_2 = $1; + } + } + + die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); + die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); + + # warn "First read 1 alignment score is: '$alignment_score_1'\n"; + # warn "First read 2 alignment score is: '$alignment_score_2'\n"; + # warn "MD tag 1 is: '$MD_tag_1'\n"; + # warn "MD tag 2 is: '$MD_tag_2'\n"; + + ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments + my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; + # warn "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; sleep(1); + + my $overwrite = 0; # If there are 2 alternative alignments to the same position, e.g. OT with 50 mismatches and CTOB with 0 mismatches, the CTOB one trumps the OT one. + # introduced 13 April 2016 as a suggestion by Sylvain Foret, ANU Canberra + + if (!defined $best_AS_so_far){ + $overwrite = 1; + $best_AS_so_far = $sum_of_alignment_scores_1; + # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; + if ($ambig_bam){ # also setting the first_ambig_alignment + # Read 1 + $first_ambig_alignment_line1 = $fhs[$index]->{last_line_1}; + $first_ambig_alignment_line1 =~ s/_(CT|GA)_converted//; + # Read 2 + $first_ambig_alignment_line2 = $fhs[$index]->{last_line_2}; + $first_ambig_alignment_line2 =~ s/_(CT|GA)_converted//; + # warn "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; sleep(1); + } + } + else{ + if ($sum_of_alignment_scores_1 >= $best_AS_so_far){ # AS are generally negative with a maximum of 0 + # 19 07 2016 Changed to >= so that equally good alignments to different positions get added as well. Ambiguous alignments are identified and removed later. + $best_AS_so_far = $sum_of_alignment_scores_1; + $overwrite = 1; + + # warn "Found better or equal sum of alignment scores ($sum_of_alignment_scores_1), setting \$best_AS_so_far to $best_AS_so_far\n"; + # resetting the ambiguous within thread memory (if applicable at all) only if the current alignment is really better than the previous one. + # 22 07 2016: ambiguous score within same thread only resets if the current alignment is really better than the previous one + if ($sum_of_alignment_scores_1 > $best_AS_so_far){ + # warn "Resetting amb within thread value to 0\n"; + $amb_same_thread = 0; + + if ($ambig_bam){ # also setting a new first_ambig_alignment + # Read 1 + $first_ambig_alignment_line1 = $fhs[$index]->{last_line_1}; + $first_ambig_alignment_line1 =~ s/_(CT|GA)_converted//; + # Read 2 + $first_ambig_alignment_line2 = $fhs[$index]->{last_line_2}; + $first_ambig_alignment_line2 =~ s/_(CT|GA)_converted//; + # warn "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; sleep(1); + } + } + } + else{ + # warn "current alignment (AS $sum_of_alignment_scores) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; + } + } + + if (defined $second_best_1 and defined $second_best_2){ + my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; + # warn "Second best alignment_score_1 is: '$second_best_1'\n"; + # warn "Second best alignment_score_2 is: '$second_best_2'\n"; + # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; + + # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we we keep a memory of this + if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ + + # checking to see if this read pair produced the best alignment + if ($sum_of_alignment_scores_1 == $best_AS_so_far){ # yes this is the best read pair so far, either within the thread or between threads, however it is ambiguous + # warn "Read pair is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; + $amb_same_thread = 1; + } + else{ + # warn "This read pair has a worse alignment score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; + } + + ### if there is a better alignment later on -> fine. If not, the read will get booted altogether one way or another + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if ($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location; + if ($position_1 <= $position_2){ + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + } + elsif($position_2 < $position_1){ + $alignment_location = join(":",$chromosome_1,$position_2,$position_1); + } + + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + if ($overwrite){ # see comment above at "my $overwrite = ..." + #unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = $sum_of_alignment_scores_second_best; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + # warn "added best of several alignments to \%alignments hash\n"; + } + + ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + # print "$alignment_location\n"; + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + #unless (exists $alignments{$alignment_location}){ # see comment above at my $overwrite = ... + if ($overwrite){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = undef; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + # warn "added unique alignment to \%alignments hash\n"; + } + + # Now reading and storing the next read pair + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + # print "$newline_1\n"; + # print "$newline_2\n"; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + if ($seq_id_1 eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + } + } + } + } + + ### If there were several equally good alignments for the best alignment score we will boot the read + if ($amb_same_thread){ + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + $alignment_ambiguous = 1; + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + } + else{ + # warn "alignment won't be considered ambiguous. This time....\n"; + } + + + ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambig_bam){ + # warn "Sequence is ambiguous, printing out to ambiguous BAM file:\n"; + # replacing the first /1\t in the ID of R1 + # warn "Was\n$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; + $first_ambig_alignment_line1 =~ s/\/1\t/\t/; + $first_ambig_alignment_line2 =~ s/\/2\t/\t/; + # warn "Is:\n$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; + + print AMBIBAM "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; + # print "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; + } + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file + unless (%alignments){ + $counting{no_single_alignment_found}++; + + # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$unmapped_read_1\n"; + # print "$unmapped_read_2\n"; + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified + } + else{ + return 0; + } + } + + ####################################################################################################################################################### + + ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence pair altogether. + ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) + ### and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference + my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + ## if (scalar keys %alignments >= 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; + # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; + # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; + # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; + # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; + # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; + # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; + # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; + # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; + # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; + # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the %alignments hash we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores_second_best}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence pair altogether) + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_sum_of_alignment_scores; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ + + # warn "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; sleep(1); + + unless (defined $best_sum_of_alignment_scores){ + $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; + } + else{ + ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted + if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ + # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; + $sequence_pair_fails = 1; + last; # exiting since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; + + if (defined $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} and ( $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} > $alignments{$alignment_location}->{sum_of_alignment_scores} )) { + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best}; + } + else { + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$alignment_location}->{sum_of_alignment_scores}; + } + + last; # exiting since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # warn "$ambiguous_read_1\n"; + # warn "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence pair (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence pair has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); + + ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### Compute MAPQ value + $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence_1), length($sequence_2), + $methylation_call_params->{$identifier}->{sum_of_alignment_scores}, + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best}); + + + ### now we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + # warn "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; + # warn " $sequence_2\n"; + # warn "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; + # warn " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; + + print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +### + +# Compute MAPQ value for a read or read pair as in Bowtie2-2.2.2 (specifically, V2 of the MAPQ calculator: "class BowtieMapq2") +# assuming end-to-end alignment with the default calculation of the minimum alignment score + +sub calc_mapq { + my ($read1Len, $read2Len, $AS_best, $AS_secBest) = @_; + + my $scMin = $score_min_intercept + $score_min_slope * $read1Len; + ### read2Len is only defined for paired-end reads, so for single-end mode we can just a score min value for read 1 + if (defined $read2Len){ + $scMin += $score_min_intercept + $score_min_slope * $read2Len; + } + + my $diff = abs$scMin; # scores can vary by up to this much (since max AS is 0 for end-to-end alignment) + my $bestOver = $AS_best - $scMin; + + if (!defined $AS_secBest) { + if ($bestOver >= $diff * 0.8) { return 42; } + elsif ($bestOver >= $diff * 0.7) { return 40; } + elsif ($bestOver >= $diff * 0.6) { return 24; } + elsif ($bestOver >= $diff * 0.5) { return 23; } + elsif ($bestOver >= $diff * 0.4) { return 8; } + elsif ($bestOver >= $diff * 0.3) { return 3; } + else { return 0; } + } else { + my $bestDiff = abs(abs($AS_best) - abs($AS_secBest)); + if ($bestDiff >= $diff * 0.9) { + if ($bestOver == $diff) { + return 39; + } else { + return 33; + } + } elsif ($bestDiff >= $diff * 0.8) { + if ($bestOver == $diff) { + return 38; + } else { + return 27; + } + } elsif ($bestDiff >= $diff * 0.7) { + if ($bestOver == $diff) { + return 37; + } else { + return 26; + } + } elsif ($bestDiff >= $diff * 0.6) { + if ($bestOver == $diff) { + return 36; + } else { + return 22; + } + } elsif ($bestDiff >= $diff * 0.5) { + if ($bestOver == $diff) { + return 35; + } elsif ($bestOver >= $diff * 0.84) { + return 25; + } elsif ($bestOver >= $diff * 0.68) { + return 16; + } else { + return 5; + } + } elsif ($bestDiff >= $diff * 0.4) { + if ($bestOver == $diff) { + return 34; + } elsif ($bestOver >= $diff * 0.84) { + return 21; + } elsif ($bestOver >= $diff * 0.68) { + return 14; + } else { + return 4; + } + } elsif ($bestDiff >= $diff * 0.3) { + if ($bestOver == $diff) { + return 32; + } elsif ($bestOver >= $diff * 0.88) { + return 18; + } elsif ($bestOver >= $diff * 0.67) { + return 15; + } else { + return 3; + } + } elsif ($bestDiff >= $diff * 0.2) { + if ($bestOver == $diff) { + return 31; + } elsif ($bestOver >= $diff * 0.88) { + return 17; + } elsif ($bestOver >= $diff * 0.67) { + return 11; + } else { + return 0; + } + } elsif ($bestDiff >= $diff * 0.1) { + if ($bestOver == $diff) { + return 30; + } elsif ($bestOver >= $diff * 0.88) { + return 12; + } elsif ($bestOver >= $diff * 0.67) { + return 7; + } else { + return 0; + } + } elsif ($bestDiff > 0) { + if ($bestOver >= $diff * 0.67) { + return 6; + } else { + return 2; + } + } else { + if ($bestOver >= $diff * 0.67) { + return 1; + } else { + return 0; + } + } + } +} + + +### + +sub decide_whether_paired_end_alignment_is_valid{ + my ($index,$identifier) = @_; + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + my $seq_id_1 = $id_1; + my $seq_id_2 = $id_2; + $seq_id_1 =~ s/\/1$//; # removing the read /1 + $seq_id_2 =~ s/\/1$//; # removing the read /1 + + ### ensuring that the current entry is the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation we need to read in two new lines + elsif($orientation == 0){ + my $newline_1 = $fhs[$index]->{fh}->getline(); + my $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) + ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; + ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; + + my $seqid; + $seq_id_1 = $id_1; + $seq_id_2 = $id_2; + # we need to capture the first read (ending on /1) + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### ensuring that the next entry is still the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + ### Writing the current sequence to last_line_1 and last_line_2 + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be + ### the next entry) + elsif ($orientation == 0){ + $newline_1 = $fhs[$index]->{fh}->getline(); + $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ($seq_id_1) = split (/\t/,$newline_1); + ($seq_id_2) = split (/\t/,$newline_2); + + $seqid = ''; + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + else { + ### assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and both last_lines (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round + else{ + return 0; + } +} + +### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the first or last position of the actually observed sequence + my $non_bisulfite_sequence_1; + my $non_bisulfite_sequence_2; + + ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic + ### sequences around! + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### for hits on the forward strand we need to capture 2 extra bases at the 3' end + + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 + + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) reverse strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end + if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) forward strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is CT converted so we need to capture 2 extra 3' bases + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation + if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is GA converted so we need to capture 2 extra 5' bases + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; +} + +### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; + my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; + my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; + my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; + + my $contains_deletion_1 = 0; + my $contains_deletion_2 = 0; + if ($cigar_1 =~ /D/){ + $contains_deletion_1 = 1; + if ($verbose){ warn "$cigar_1\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_1}\n";} + } + if ($cigar_2 =~ /D/){ + $contains_deletion_2 = 1; + if ($verbose){ warn "$cigar_2\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_2}\n";} + } + + # warn "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence_1 = ''; + my $non_bisulfite_sequence_2 = ''; + my $genomic_seq_for_MD_tag_1 = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output + my $genomic_seq_for_MD_tag_2 = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; + my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; + + # parsing CIGAR 1 string + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + # parsing CIGAR 2 string + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + + my $indels_1 = 0; # adding these to the hemming distance value (needed for the NM field in the final SAM output + my $indels_2 = 0; + + ### Extracting read 1 genomic sequence ### + + # extracting 2 additional bp at the 5' end (read 1) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; + return; + } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); + } + + foreach (0..$#len_1){ + if ($ops_1[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + } + # warn "$non_bisulfite_sequence_1\n"; + # adjusting position + $pos_1 += $len_1[$_]; + } + elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls, and we can later ignore it for the generation of the MD;Z: tag + $non_bisulfite_sequence_1 .= 'X' x $len_1[$_]; + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= 'X' x $len_1[$_]; + } + # warn "$non_bisulfite_sequence_1\n"; + # position doesn't need adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() + # indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + } + # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; + $pos_1 += $len_1[$_]; + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else{ + die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + ### 3' end of read 1 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); + } + + + ### Extracting read 2 genomic sequence ### + + ### 5' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); + } + + foreach (0..$#len_2){ + if ($ops_2[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + } + # warn "$non_bisulfite_sequence_2\n"; + # adjusting position + $pos_2 += $len_2[$_]; + } + elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can ignore this later during the generation of the MD:Z: tag + $non_bisulfite_sequence_2 .= 'X' x $len_2[$_]; + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= 'X' x $len_2[$_]; + } + # warn "$non_bisulfite_sequence_2\n"; + # position doesn't need adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() + # $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + } + # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; + $pos_2 += $len_2[$_]; + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else{ + die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + + ### 3' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise + # need to set read 1 as well now to prevent warning + # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; + # sleep(5); + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); + } + + ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly + + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) bottom strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) top strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; + + ## the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; + $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; + $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; + $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_result_single_end{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. + $methylation_call_params->{$identifier}->{position} += 1; + + ### writing every uniquely mapped read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); + print OUT "$bowtie1_output\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script + } +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_result_single_end_bowtie2{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) + $methylation_call_params->{$identifier}->{start_seq_1} += 1; + + ### writing every single aligned read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); + print OUT "$bowtie1_output_paired_end\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + } + +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends_bowtie2{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + +} + + +sub convert_phred64_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_solexa_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_phred_score_into_phred33_quality_string{ + my $qual = shift; + $qual = chr($qual+33); + return $qual; +} + +sub convert_phred64_quality_string_into_phred_score{ + my $string = shift; + my $qual = ord($string)-64; + return $qual; +} + +sub convert_solexa_pre1_3_quality_string_into_phred_score{ + ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 + my $string = shift; + my $qual = ord($string)-59; + return $qual; +} + + +sub extract_corresponding_genomic_sequence_single_end { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence; + ### depending on the conversion we want to make need to capture 1 extra base at the 3' end + + my $pbat_index_modifier = 0; + + if ($pbat){ + $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1! + } + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 + ### + 2 extra base at the 3' end + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > + ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > + ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### at this point we can also determine the end position of a read + $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); +} + + +sub extract_corresponding_genomic_sequence_single_end_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + + my $MD_tag = $methylation_call_params->{$sequence_identifier}->{MD_tag}; + my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; + + my $contains_deletion = 0; + if ($cigar =~ /D/){ + $contains_deletion = 1; + # warn "$cigar\n$MD_tag\n"; + } + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + my $non_bisulfite_sequence = ''; + my $genomic_seq_for_MD_tag = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my $pbat_index_modifier = 0; + + if ($pbat){ + $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1! + } + + ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) + if ( (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1) or (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); + } + + my $indels = 0; + + foreach (0..$#len){ + if ($ops[$_] eq 'M'){ + #extracting genomic sequence + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + } + # adjusting position + $pos += $len[$_]; + } + elsif ($ops[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can later ignore it better during the generation of the MD:Z-tag + $non_bisulfite_sequence .= 'X' x $len[$_]; + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= 'X' x $len[$_]; + } + # warn "$non_bisulfite_sequence\n"; + # position doesn't need to be adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail the base by base comparison in hemming_dist() + # $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif ($ops[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + + # we do however add the genomic sequence to the $genomic_sequence for MD-tag determination if the CIGAR string contained a deletion + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + } + $pos += $len[$_]; + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else{ + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + + ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) + if ( ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0) or ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); + # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; + } + + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + if ($contains_deletion){ + $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + if ($contains_deletion){ + $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + } + else{ + die "Too many Bowtie 2 result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + + # if ($contains_deletion){ + # warn "non-bis: $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence}\n"; + # warn "MD-seq: $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag}\n"; + # } + + ### the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; + $methylation_call_params->{$sequence_identifier}->{indels} = $indels; +} + +### METHYLATION CALL + +sub methylation_call{ + my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; + ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one + my @seq = split(//,$sequence_actually_observed); + my @genomic = split(//,$genomic_sequence); + # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; + ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either + ### CpG, CHH or CHG context + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in unknown context (was protected) ### + ### u for not methylated C in unknwon context (was converted) ### + ################################################################# + + my @match =(); + warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 + my $methyl_CHH_count = 0; + my $methyl_CHG_count = 0; + my $methyl_CpG_count = 0; + my $methyl_C_unknown_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CpG_count = 0; + my $unmethylated_C_unknown_count = 0; + + if ($read_conversion eq 'CT'){ + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index]) { + ### The residue can only be a C if it was not converted to T, i.e. protected my methylation + if ($genomic[$index] eq 'C') { + ### If the residue is a C we want to know if it was in CpG context or in any other context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C, methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C, methylated, in Unknown context + } + else { + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C, methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C, methylated, in CHH context + } + } + } + else { + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts + ### in the actually observed sequence + if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { + ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C, not methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C, not methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + elsif ($read_conversion eq 'GA'){ + # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; + + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index+2]) { + ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation + if ($genomic[$index+2] eq 'G') { + ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need + ### to look if the base upstream is a C + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C on opposing strand, methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C on opposing strand, methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C on opposing strand, methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C on opposing strand, methylated, in CHH context + } + } + } + else{ + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index+2]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts + ### on the opposing strand, so G to A conversions in the actually observed sequence + if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { + ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if + ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C on opposing strand, not methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C on opposing strand, not methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C on opposing strand, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + else{ + die "Strand conversion info is required to perform a methylation call\n"; + } + + my $methylation_call = join ("",@match); + + $counting{total_meCHH_count} += $methyl_CHH_count; + $counting{total_meCHG_count} += $methyl_CHG_count; + $counting{total_meCpG_count} += $methyl_CpG_count; + $counting{total_meC_unknown_count} += $methyl_C_unknown_count; + $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; + $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; + $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; + $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; + + # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; + + return $methylation_call; +} + +sub read_genome_into_memory{ + + ## working directoy + my $cwd = shift; + + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + my $SQ_count = 0; + + foreach my $chromosome_filename (@chromosome_filenames){ + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + $first_line =~ s/\r//; + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + my $sequence; + + while (){ + chomp; + $_ =~ s/\r//; # removing carriage returns if present + if ($_ =~ /^>/){ + + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $SQ_order{$SQ_count} = $chromosome_name; + + ++$SQ_count; + + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files + + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + + ++$SQ_count; + + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $SQ_order{$SQ_count} = $chromosome_name; + } + } + print "\n"; + chdir $cwd or die "Failed to move to directory $cwd\n"; + + ### If no single multi-FastA genome file was specified explicitely we will generate one here and write it to the output directory + if ($cram){ + unless (defined $cram_ref){ + warn "Reconstituting a single multi-FastA genome file as CRAM reference (you may specify such a file using --cram_ref explicitely to prevent this behaviour)\n"; + + $cram_ref = "${output_dir}Bismark_genome_CRAM_reference.mfa"; + warn "Writing multi-FastA file to $cram_ref\n"; + open (REF,'>',"$cram_ref") or die "Failed to write to file $cram_ref\n"; + foreach my $chr(keys %chromosomes){ + print REF ">$chr\n$chromosomes{$chr}\n"; + } + warn "Complete\n"; + close REF or warn "Failed to close filehandle REF: $!\n"; + } + + } + + + +} + +sub extract_chromosome_name { + ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + +sub reverse_complement{ + my $sequence = shift; + $sequence =~ tr/CATG/GTAC/; + $sequence = reverse($sequence); + return $sequence; +} + +sub biTransformFastAFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fa.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fa.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + + unless ($directional){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ### small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); + + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT "$header$sequence_C_to_T"; + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA "$header$sequence_G_to_A"; + } + } + close CTOT or die "Failed to close filehandle $!\n"; + + if ($directional){ + warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastAFiles_paired_end { + my ($file,$read_number) = @_; + + if ($gzip){ + warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n"; + sleep (2); + } + + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + } + else{ # all four strand output + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/); + + if ($read_number == 1){ + if ($bowtie2){ + $header =~ s/$/\/1\/1/; + } + else{ + $header =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $header =~ s/$/\/2\/2/; + } + else{ + $header =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + + if ($read_number == 1){ + print CTOT "$header$sequence_C_to_T"; + } + elsif ($read_number == 2){ + print GTOA "$header$sequence_G_to_A"; + } + } + else{ + print CTOT "$header$sequence_C_to_T"; + print GTOA "$header$sequence_G_to_A"; + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + return ($C_to_T_infile); + } + else{ + return ($G_to_A_infile); + } + } + else{ + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +sub biTransformFastQFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($pbat){ # PBAT-Seq + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ # directional or non-directional + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option + } + + unless ($directional){ + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($identifier,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + if ($pbat){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + else{ # directional or non-directional + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + } + + if ($directional){ + close CTOT or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + elsif($pbat){ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastQFiles_paired_end { + my ($file,$read_number) = @_; + my ($dir,$filename); + + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"gunzip -c $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ + die "Read number needs to be 1 or 2, but was $read_number!\n\n"; + } + } + else{ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + ++$count; + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence= uc$sequence; # make input file case insensitive + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + if ($read_number == 1){ + if ($bowtie2){ + $identifier =~ s/$/\/1\/1/; + } + else{ + $identifier =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $identifier =~ s/$/\/2\/2/; + } + else{ + $identifier =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2\n"; + } + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + if ($read_number == 1){ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + } + else{ + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + else{ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + if ($directional){ + if ($read_number == 1){ + close CTOT or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile); + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES + +sub biTransformFastQFiles_paired_end_bowtie1_gzip { + my ($file_1,$file_2) = @_; + my ($dir,$filename); + + if ($file_1 =~ /\//){ + ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file_1; + } + + ### gzipped version of infile 1 + if ($file_1 =~ /\.gz$/){ + open (IN_1,"gunzip -c $file_1 |") or die "Couldn't read from file $file_1: $!\n"; + } + else{ + open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; + } + ### gzipped version of infile 2 + if ($file_2 =~ /\.gz$/){ + open (IN_2,"gunzip -c $file_2 |") or die "Couldn't read from file $file_2: $!\n"; + } + else{ + open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; + } + + + if ($skip){ + warn "Skipping the first $skip reads from $file_1 and $file_2\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; + sleep (1); + } + + my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; + $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; + $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + } + + $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; + $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + + warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; + open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; + # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; + + unless ($directional){ + print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; + open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; + } + + ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format: + ### + + my $count = 0; + while (1){ + my $identifier_1 = ; + my $sequence_1 = ; + my $identifier2_1 = ; + my $quality_score_1 = ; + + my $identifier_2 = ; + my $sequence_2 = ; + my $identifier2_2 = ; + my $quality_score_2 = ; + + last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); + + ++$count; + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ + die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ + die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + chomp $identifier_1; + chomp $sequence_1; + chomp $sequence_2; + chomp $quality_score_1; + chomp $quality_score_2; + + $identifier_1 =~ s/^\@//; + $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence_1 = uc$sequence_1; # make input file 1 case insensitive + $sequence_2 = uc$sequence_2; # make input file 2 case insensitive + + # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; + my $sequence_1_C_to_T = $sequence_1; + my $sequence_2_G_to_A = $sequence_2; + $sequence_1_C_to_T =~ tr/C/T/; + $sequence_2_G_to_A =~ tr/G/A/; + + print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; + + unless ($directional){ + my $sequence_1_G_to_A = $sequence_1; + my $sequence_2_C_to_T = $sequence_2; + $sequence_1_G_to_A =~ tr/G/A/; + $sequence_2_C_to_T =~ tr/C/T/; + print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; + } + } + + close CTPLUSGA or die "Couldn't close filehandle\n"; + warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; + + if ($directional){ + warn "\n"; + return ($CT_plus_GA_infile); + } + else{ + close GAPLUSCT or die "Couldn't close filehandle\n"; + warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; + return ($CT_plus_GA_infile,$GA_plus_CT_infile); + } +} + + +sub fix_IDs{ + my $id = shift; + $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores + return $id; +} + +sub ensure_sensical_alignment_orientation_single_end{ + my $index = shift; # index number if the sequence produced an alignment + my $strand = shift; + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) + ## here we only want reads in the forward (+) orientation + if ($fhs[$index]->{name} eq 'CTreadCTgenome') { + ### if the alignment is (+) we count it, and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) + ## here we only want reads in the forward (+) orientation + elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { + ### if the alignment is (+) we count it and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } else{ + die "One of the above conditions must be true\n"; + } +} + +sub ensure_sensical_alignment_orientation_paired_ends{ + my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## [Index 0, sequence originated from (converted) forward strand] + ## CT converted read 1 + ## GA converted read 2 + ## CT converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 1, sequence originated from (converted) reverse strand] + ## GA converted read 1 + ## CT converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 2, sequence originated from complementary to (converted) forward strand] + ## GA converted read 1 + ## CT converted read 2 + ## CT converted genome + ## here we only want read 1 in (-) orientation and read 2 in (+) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 3, sequence originated from complementary to (converted) reverse strand] + ## CT converted read 1 + ## GA converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + else{ + die "One of the above conditions must be true\n"; + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 1 (default) | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input file is $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ; PBAT-Seq)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 and $G_to_A_infile_1 and $C_to_T_infile_2 (non-directional; FastQ)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional or $pbat){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; # skipping unwanted filehandles + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + if ($gzip){ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n"; + open ($fh->{fh},"gunzip -c ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n"; + sleep(5); + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + } + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + elsif ($pbat){ + warn "Input files are $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; + } + + ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional or $pbat){ # skipping unwanted filehandles + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + if ($gzip){ + open ($fh->{fh},"gunzip -c $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie 2: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + + +### Bowtie 1 (default) | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; + } + + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + sleep (5); + + if ($gzip){ + open ($fh->{fh},"gunzip -c $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n\n"; + } + elsif ($pbat){ + warn "Input file is $G_to_A_infile (FastQ)\n\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; + } + + ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; + warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; + + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + # warn "$_\n"; + # sleep(1); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else { + last; + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + # warn "storing $id and\n$_\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +########################################################################################################################################### + +sub reset_counters_and_fhs{ + my $filename = shift; + %counting=( + total_meCHH_count => 0, + total_meCHG_count => 0, + total_meCpG_count => 0, + total_meC_unknown_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CpG_count => 0, + total_unmethylated_C_unknown_count => 0, + sequences_count => 0, + no_single_alignment_found => 0, + unsuitable_sequence_count => 0, + genomic_sequence_could_not_be_extracted_count => 0, + unique_best_alignment_count => 0, + low_complexity_alignments_overruled_count => 0, + CT_CT_count => 0, #(CT read/CT genome, original top strand) + CT_GA_count => 0, #(CT read/GA genome, original bottom strand) + GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) + GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) + CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) + GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) + GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) + CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) + alignments_rejected_count => 0, # only relevant if --directional was specified + ); + + if ($directional){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + elsif($pbat){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + else{ + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } +} + + +sub process_command_line{ + my @bowtie_options; + my $help; + my $mates1; + my $mates2; + my $path_to_bowtie; + my $fastq; + my $fasta; + my $skip; + my $qupto; + my $phred64; + my $phred33; + my $solexa; + my $mismatches; + my $seed_length; + my $best; + my $sequence_format; + my $version; + my $quiet; + my $chunk; + my $non_directional; + my $ceiling; + my $maxins; + my $minins; + my $unmapped; + my $multi_map; + my $output_dir; + my $bowtie2; + my $vanilla; + my $sam_no_hd; + my $seed_extension_fails; + my $reseed_repetitive_seeds; + my $most_valid_alignments; + my $score_min; + my $parallel; + my $temp_dir; + my $rdg; + my $rfg; + my $non_bs_mm; + my $samtools_path; + my $bam; + my $gzip; + my $pbat; + my $prefix; + my $old_flag; + my $basename; + my $sam; + my $multicore; + my $bowtie1; + my $rg_tag; + my $rg_id; + my $rg_sample; + my $genome_folder; + my $singles; + my $ambig_bam; + my $cram; + my $cram_ref; + my $nucleotide_coverage; + my $dovetail; + + my $command_line = GetOptions ('help|man' => \$help, + '1=s' => \$mates1, + '2=s' => \$mates2, + 'path_to_bowtie=s' => \$path_to_bowtie, + 'genome_folder=s' => \$genome_folder, + 'f|fasta' => \$fasta, + 'q|fastq' => \$fastq, + 's|skip=i' => \$skip, + 'u|upto=i' => \$qupto, + 'phred33-quals' => \$phred33, + 'phred64-quals|solexa1' => \$phred64, + 'solexa-quals' => \$solexa, + 'n|seedmms=i' => \$mismatches, + 'l|seedlen=i' => \$seed_length, + 'no_best' => \$best, + 'version' => \$version, + 'quiet' => \$quiet, + 'chunkmbs=i' => \$chunk, + 'non_directional' => \$non_directional, + 'I|minins=i' => \$minins, + 'X|maxins=i' => \$maxins, + 'e|maqerr=i' => \$ceiling, + 'un|unmapped' => \$unmapped, + 'ambiguous' => \$multi_map, + 'o|output_dir=s' => \$output_dir, + 'bowtie2' => \$bowtie2, + 'bowtie1' => \$bowtie1, + 'vanilla' => \$vanilla, + 'sam-no-hd' => \$sam_no_hd, + 'D=i' => \$seed_extension_fails, + 'R=i' => \$reseed_repetitive_seeds, + 'score_min=s' => \$score_min, + 'most_valid_alignments=i' => \$most_valid_alignments, + 'p=i' => \$parallel, + 'temp_dir=s' => \$temp_dir, + 'rdg=s' => \$rdg, + 'rfg=s' => \$rfg, + 'non_bs_mm' => \$non_bs_mm, + 'samtools_path=s' => \$samtools_path, + 'bam' => \$bam, + 'gzip' => \$gzip, + 'pbat' => \$pbat, + 'prefix=s' => \$prefix, + 'old_flag' => \$old_flag, + 'B|basename=s' => \$basename, + 'sam' => \$sam, + 'multicore=i' => \$multicore, + 'rg_tag' => \$rg_tag, + 'rg_id=s' => \$rg_id, + 'rg_sample=s' => \$rg_sample, + 'se|single_end=s' => \$singles, + 'ambig_bam' => \$ambig_bam, + 'cram' => \$cram, + 'cram_ref=s' => \$cram_ref, + 'nucleotide_coverage' => \$nucleotide_coverage, + 'dovetail' => \$dovetail, + ); + + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + if ($version){ + print << "VERSION"; + + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Version: $bismark_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + + +VERSION + exit; + } + + + ########################## + ### PROCESSING OPTIONS ### + ########################## + + if ($bowtie1){ + $bowtie2 = 0; + } + else{ # Bowtie 2 is now the default mode (as of 27 July 2015) + $bowtie2 = 1; + } + + unless ($sam_no_hd){ + $sam_no_hd =0; + } + + ### PATH TO BOWTIE + ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (-d $path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie = "${path_to_bowtie}bowtie2"; + } + else{ + $path_to_bowtie = "${path_to_bowtie}bowtie"; + } + } + else{ + die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; + } + } + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2'; + warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } + else{ + $path_to_bowtie = 'bowtie'; + warn "Path to Bowtie specified as: $path_to_bowtie\n"; + } + } + + + if ($sam){ + warn "Output format manually set as SAM\n"; + } + elsif($cram){ + warn "Output format set to CRAM\n"; + if (defined $cram_ref){ + warn "CRAM reference given as: '$cram_ref'\n\n"; + unless (-e $cram_ref){ + die "There is a problem with the CRAM reference '$cram_ref': $!\n\n"; + } + + # determining full path information for the cram reference + if ($cram_ref =~/\//){ + if (chdir $cram_ref){ + my $absolute_cram_ref_folder = getcwd; ## making the genome folder path absolute + unless ($absolute_cram_ref_folder =~/\/$/){ + $absolute_cram_ref_folder =~ s/$/\//; + } + } + } + } + else{ + warn "CRAM reference not specified explicitely, regenerating from FastA reference files\n\n"; + } + } + else{ + $bam = 1; + warn "Output format is BAM (default)\n"; + } + + ### OUTPUT REQUESTED AS BAM FILE (default) + if ($bam or $cram){ + if ($vanilla){ + die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + + warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; + $bam = 1; + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; sleep(1); + $bam = 1; + } + } + + unless (defined $samtools_path){ + $bam = 2; + warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; + } + sleep (1); + } + + ### OPTION AMBIGUOUS BAM + if ($ambig_bam){ + unless ($bowtie2){ + die "The option --ambig_bam is only available for Bowtie2 alignments\n"; + } + } + + + #################################### + ### PROCESSING ARGUMENTS + + ### GENOME FOLDER + if (defined $genome_folder){ # 25 11 2015 The genome folder may now also be defined with the option --genome_folder + # warn "Genome folder specified with --genome_folder $genome_folder\n"; + } + else{ + $genome_folder = shift @ARGV; # mandatory + } + + unless ($genome_folder){ + warn "Genome folder was not specified!\n"; + print_helpfile(); + exit; + } + + ### checking that the genome folder, all subfolders and the required bowtie index files exist + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + + if (chdir $genome_folder){ + my $absolute_genome_folder = getcwd; ## making the genome folder path absolute + unless ($absolute_genome_folder =~/\/$/){ + $absolute_genome_folder =~ s/$/\//; + } + warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; + $genome_folder = $absolute_genome_folder; + } + else{ + die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] {-1 -2 | } [] (--help for more details)\n"; + } + + my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; + my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; + + my $bt2_small_index_present = 1; + my $bt2_large_index_present = 1; + + my $bt_small_index_present = 1; + my $bt_large_index_present = 1; + + if ($bowtie2){ ### Bowtie 2 + + ### Checking for small indixes first (ending in .bt2) + + # checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + + my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + warn "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; + $bt2_small_index_present = 0; + } + } + # checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); + + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + warn "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; + $bt2_small_index_present = 0; + } + } + + ### Using the small index preferentially + if ($bt2_small_index_present){ + $bt2_large_index_present = 0; + } + else{ # only checking for large indexes if the 'normal' one can't be found + warn "\nCouldn't find a traditional small Bowtie 2 index for the genome specified (ending in .bt2). Now searching for a large index instead (64-bit index ending in .bt2l)...\n"; + + ### If no small small indexes were found we look for large indexes (64-bit indexes, ending in .bt2l) + + # checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + + @CT_bowtie_index = ('BS_CT.1.bt2l','BS_CT.2.bt2l','BS_CT.3.bt2l','BS_CT.4.bt2l','BS_CT.rev.1.bt2l','BS_CT.rev.2.bt2l'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; + $bt2_large_index_present = 0; } + } + + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + @GA_bowtie_index = ('BS_GA.1.bt2l','BS_GA.2.bt2l','BS_GA.3.bt2l','BS_GA.4.bt2l','BS_GA.rev.1.bt2l','BS_GA.rev.2.bt2l'); + + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; + $bt2_large_index_present = 0; + } + } + + if ($bt2_large_index_present){ + warn "64-bit large genome Bowtie 2 index found...\n"; + } + else{ + die "Failed to detect either a standard (.bt2) or 64-bit (.bt2l) Bowtie 2 index for the genome specified. Please run the bismark_genome_preparation before launching Bismark\n\n"; + } + } + + } + + else{ ### Bowtie 1 + ### Checking for small indixes first (ending in .ebwt) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + warn "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; + $bt_small_index_present = 0; + } + } + + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + warn "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; + $bt_small_index_present = 0; + } + } + + ### Using the small index preferentially + if ($bt_small_index_present){ + $bt_large_index_present = 0; + } + else{ # only checking for large indexes if the 'normal' one can't be found + warn "\nCouldn't find a traditional small Bowtie index for the genome specified (ending in .ebwt). Now searching for a large index instead (64-bit index ending in .ebwtl)...\n"; + + ### If no small small indexes were found we look for large indexes (64-bit indexes, ending in .ebwtl) + + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.ebwtl','BS_CT.2.ebwtl','BS_CT.3.ebwtl','BS_CT.4.ebwtl','BS_CT.rev.1.ebwtl','BS_CT.rev.2.ebwtl'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + warn "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; + $bt_large_index_present = 0; + } + } + + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.ebwtl','BS_GA.2.ebwtl','BS_GA.3.ebwtl','BS_GA.4.ebwtl','BS_GA.rev.1.ebwtl','BS_GA.rev.2.ebwtl'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + warn "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; + $bt_large_index_present = 0; + } + } + + if ($bt_large_index_present){ + warn "64-bit large genome Bowtie index found...\n"; + } + else{ + die "Failed to detect either a standard (.ebwt) or 64-bit (.ebwtl) Bowtie index for the genome specified. Please run the bismark_genome_preparation --bowtie1 before launching Bismark\n\n"; + } + } + + } + + my $CT_index_basename = "${CT_dir}BS_CT"; + my $GA_index_basename = "${GA_dir}BS_GA"; + + ### INPUT OPTIONS + + ### SEQUENCE FILE FORMAT + ### exits if both fastA and FastQ were specified + if ($fasta and $fastq){ + die "Only one sequence filetype can be specified (fastA or fastQ)\n"; + } + + ### unless fastA is specified explicitely, fastQ sequence format is expected by default + if ($fasta){ + print "FastA format specified\n"; + $sequence_format = 'FASTA'; + push @bowtie_options, '-f'; + } + elsif ($fastq){ + print "FastQ format specified\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + else{ + $fastq = 1; + print "FastQ format assumed (by default)\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + + ### SKIP + if ($skip){ + warn "Skipping the first $skip reads from the input file\n"; + # push @bowtie_options,"-s $skip"; + } + + ### UPTO + if ($qupto){ + warn "Processing sequences up to read no. $qupto from the input file\n"; + if ($bowtie2){ + # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 + } + else{ + # push @bowtie_options,"--qupto $qupto"; + } + } + + ### QUALITY VALUES + if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ + die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; + } + if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values works only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred33"; + } + else{ + push @bowtie_options,"--phred33-quals"; + } + } + if ($phred64){ + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values work only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred64"; + } + else{ + push @bowtie_options,"--phred64-quals"; + } + } + else{ + $phred64 = 0; + } + + if ($solexa){ + if ($bowtie2){ + die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; + } + # Solexa to Phred value conversion works only when -q is specified + unless ($fastq){ + die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; + } + push @bowtie_options,"--solexa-quals"; + } + else{ + $solexa = 0; + } + + ### ALIGNMENT OPTIONS + + ### MISMATCHES + if (defined $mismatches){ + if ($bowtie2){ + if ($mismatches == 0 or $mismatches == 1){ + push @bowtie_options,"-N $mismatches"; + } + else{ + die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n"; + } + } + else{ + if ($mismatches >= 0 and $mismatches <= 3){ + push @bowtie_options,"-n $mismatches"; + } + else{ + die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n"; + } + } + } + else{ + unless ($bowtie2){ + push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 + } + } + + ### SEED LENGTH + if (defined $seed_length){ + if ($bowtie2){ + push @bowtie_options,"-L $seed_length"; + } + else{ + push @bowtie_options,"-l $seed_length"; + } + } + + ### MISMATCH CEILING + if (defined $ceiling){ + die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); + push @bowtie_options,"-e $ceiling"; + } + + + ### BOWTIE 2 EFFORT OPTIONS + + ### CONSECUTIVE SEED EXTENSION FAILS + if (defined $seed_extension_fails){ + die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-D $seed_extension_fails"; + } + + ### RE-SEEDING REPETITIVE SEEDS + if (defined $reseed_repetitive_seeds){ + die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-R $reseed_repetitive_seeds"; + } + + + ### BOWTIE 2 SCORING OPTIONS + + my ($score_min_intercept, $score_min_slope); + + if ($score_min){ + die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + + unless ($score_min =~ /^L,(.+),(.+)$/){ + die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + ($score_min_intercept, $score_min_slope) = ($1, $2); + push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 + } + else{ + if ($bowtie2){ + ($score_min_intercept, $score_min_slope) = (0, -0.2); + push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 + } + } + + ### BOWTIE 2 READ GAP OPTIONS + my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); + + if ($rdg){ + die "The option '--rdg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rdg =~ /^(\d+),(\d+)$/){ + $deletion_open = $1; + $deletion_extend = $2; + } + else{ + die "The option '--rdg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rdg $rdg"; + } + else{ + $deletion_open = 5; + $deletion_extend = 3; + } + + ### BOWTIE 2 REFERENCE GAP OPTIONS + if ($rfg){ + die "The option '--rfg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rfg =~ /^(\d+),(\d+)$/){ + $insertion_open = $1; + $insertion_extend = $2; + } + else{ + die "The option '--rfg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rfg $rfg"; + } + else{ + $insertion_open = 5; + $insertion_extend = 3; + } + + + ### BOWTIE 2 PARALLELIZATION OPTIONS + if (defined $parallel){ + die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); + } + if ($bowtie2){ + if ($parallel){ + die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); + if ($parallel > 4){ + warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2); + } + push @bowtie_options,"-p $parallel"; + push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. + print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; + sleep (2); + } + } + + ### REPORTING OPTIONS + + if ($bowtie2){ + push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default + + ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while + if(defined $most_valid_alignments){ + + warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; + } + } + else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 + push @bowtie_options,'-k 2'; + } + + ### --BEST + if ($bowtie2){ + if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used + die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; + } + } + else{ + # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) + unless ($best){ + push @bowtie_options,'--best'; + } + } + + ### VANILLA BISMARK (BOWTIE 1) OUTPUT + if ($vanilla){ + if ($bowtie2){ + die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; + } + } + else{ + $vanilla = 0; + } + + ### PAIRED-END MAPPING + if ($mates1){ + + if (defined $singles){ # if --single_end has been set explicitely + die "You cannot set --single_end and supply files in paired-end format (-1 -2 ). Please respecify!\n"; + } + + my @mates1 = (split (/,/,$mates1)); + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2); + my @mates2 = (split(/,/,$mates2)); + unless (scalar @mates1 == scalar @mates2){ + die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n"; + } + while (1){ + my $mate1 = shift @mates1; + my $mate2 = shift @mates2; + last unless ($mate1 and $mate2); + push @filenames,"$mate1,$mate2"; + } + if ($bowtie2){ + push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments + push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones + + if ($pbat){ + $dovetail = 1; # setting the option $dovetail for PBAT paired-end alignments + } + + if ($dovetail){ + if ($old_flag){ + die "The option --dovetail may only be specified with the current SAM FLAG values. Please respecify...\n"; + } + push @bowtie_options,'--dovetail'; ## 07 03 2016 Adding the option --dovetail, mainly for PBAT alignments + } + } + + if ($old_flag){ + warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); + sleep(3); + } + } + elsif ($mates2){ + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n"; + } + + ### SINGLE-END MAPPING + # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified + + unless ($mates1 and $mates2){ + if (defined $singles){ # if --single_end has been set explicitely + warn "Mapping set to single-end mode (user defined). File names need to be separated by commas [,] or colons [:]! Supplied file names are: $singles\n"; + $singles =~ s/:/,/g; # replacing colons (:) with commas + } + else{ + $singles = join (',',@ARGV); + unless ($singles){ + die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; + } + $singles =~ s/\s/,/g; # replacing spaces with commas + } + + @filenames = (split(/,/,$singles)); + warn "\nFiles to be analysed:\n"; + warn "@filenames\n\n"; + sleep (3); + } + + ### MININUM INSERT SIZE (PAIRED-END ONLY) + if (defined $minins){ + die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--minins $minins"; + } + + ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) + if (defined $maxins){ + die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--maxins $maxins"; + } + else{ + unless ($singles){ + push @bowtie_options,'--maxins 500'; + } + } + + ### QUIET prints nothing besides alignments (suppresses warnings) + if ($quiet){ + push @bowtie_options,'--quiet'; + } + + ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments + unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option + if (defined $chunk){ + push @bowtie_options,"--chunkmbs $chunk"; + } + else{ + push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) + } + } + + + ### SUMMARY OF ALL BOWTIE OPTIONS + my $bowtie_options = join (' ',@bowtie_options); + + + ### STRAND-SPECIFIC LIBRARIES + my $directional; + if ($non_directional){ + die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); + warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; + sleep (1); + $directional = 0; + } + elsif($pbat){ + die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); + die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); + + warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; + sleep (1); + $directional = 0; + } + else{ + warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; + sleep (1); + $directional = 1; # default behaviour + } + + ### UNMAPPEDSEQUENCE OUTPUT + $unmapped = 0 unless ($unmapped); + + ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT + $multi_map = 0 unless ($multi_map); + + + ### OUTPUT DIRECTORY + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + + if (chdir $output_dir){ + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; + warn "Created output directory $output_dir!\n\n"; + chdir $output_dir or die "Failed to move to $output_dir\n"; + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + warn "Output will be written into the directory: $output_dir\n"; + } + else{ + $output_dir = ''; + } + + ### TEMPORARY DIRECTORY for C->T and G->A transcribed files + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($temp_dir){ + warn "\nUsing temp directory: $temp_dir\n"; + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + + if (chdir $temp_dir){ + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + else{ + mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; + warn "Created temporary directory $temp_dir!\n\n"; + chdir $temp_dir or die "Failed to move to $temp_dir\n"; + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + warn "Temporary files will be written into the directory: $temp_dir\n"; + } + else{ + $temp_dir = ''; + } + + ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE + if ($non_bs_mm){ + if ($vanilla){ + die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; + } + } + + ### PREFIX FOR OUTPUT FILES + if ($prefix){ + # removing trailing dots + + $prefix =~ s/\.+$//; + + warn "Using the following prefix for output files: $prefix\n\n"; + sleep(1); + } + + if (defined $multicore){ + unless ($multicore > 0){ + die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; + } + if ($multicore > 20){ + warn "Core usage currently set to more than 20 threads. This might fail horribly but let's see how it goes... (set value: $multicore)\n\n"; + } + if ($sam){ + die "The multicore function currently requires the output to be in BAM format, so please lose either option --sam or --multi\n"; + } + } + else{ + $multicore = 1; # default. Single-thread mode + warn "Setting parallelization to single-threaded (default)\n\n"; + } + + if ($basename and $multicore > 1){ + die "Specifying --basename in conjuction with --multicore is currently not supported (but we are aiming to fix this soon). Please lose either --basename or --multicore to proceed\n\n"; + } + + # Read Group Tags for the @RG header + if (defined $rg_sample){ + if (defined $rg_id){ + warn "--rg_id set to '$rg_id', setting --rg_tag to TRUE\n"; + $rg_tag++; # implicitely setting $rg_tag as well + } + else{ + die "--rg_sample cannot be specified without without setting --rg_id. Please set both or none (which would result in the default name 'SAMPLE' for both)\n"; + } + } + + if ($rg_tag){ # either true because of --rg_tag, or because --rg_id/--rg_sample were defined as well + unless (defined $rg_id){ + $rg_id = 'SAMPLE'; + } + unless (defined $rg_sample){ + $rg_sample = 'SAMPLE'; + } + } + + return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index_present,$multicore,$rg_tag,$rg_id,$rg_sample,$ambig_bam,$cram,$cram_ref,$nucleotide_coverage,$dovetail); +} + + + +sub generate_SAM_header{ + + print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order + if ($ambig_bam){ + print AMBIBAM "\@HD\tVN:1.0\tSO:unsorted\n"; + } + + # Unordered printing of @SQ headers + # foreach my $chr (keys %chromosomes){ + # my $length = length ($chromosomes{$chr}); + # print "\@SQ\tSN:$chr\tLN:$length\n"; + # print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + # } + + foreach my $chr (sort {$a<=>$b} keys %SQ_order){ + # warn "$chr\t$SQ_order{$chr}\n"; + my $length = length ($chromosomes{$SQ_order{$chr}}); + print OUT "\@SQ\tSN:$SQ_order{$chr}\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + if ($ambig_bam){ + print AMBIBAM "\@SQ\tSN:$SQ_order{$chr}\tLN:$length\n"; + } + } + + # 18 11 2015: Added @RG as a header line if --rg_tag or --rg_id/--rg_sample were set as well + if ($rg_tag){ + print OUT "\@RG\tPL:ILLUMINA\tID:$rg_id\tSM:$rg_sample\n"; # @RG = Read Group, PL = Platform, ID: required, SM: sample, can be a description + } + + print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version + if ($ambig_bam){ + print AMBIBAM "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; + } +} + +### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: +### O. Tam (2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011), A. Dei Rossi (2014) + +sub single_end_SAM_output{ + + my ($id,$actual_seq,$methylation_call_params,$qual) = @_; + my $strand = $methylation_call_params->{$id}->{alignment_strand}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $start = $methylation_call_params->{$id}->{position}; + my $stop = $methylation_call_params->{$id}->{end_position}; + my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; + my $methcall = $methylation_call_params->{$id}->{methylation_call}; + my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + my $number_of_mismatches; + + if ($bowtie2){ + $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; + } + else{ + $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; + } + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2**7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ##### + + my $flag; # FLAG variable used for SAM format. + if ($strand eq "+"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (OT) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + elsif ($strand eq "-"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (OB) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + else{ + die "Unexpected strand information: $strand\n\n"; + } + + ##### + + my $mapq; + + if ($bowtie2){ + $mapq = $methylation_call_params->{$id}->{mapq}; + } + else{ + $mapq = 255; # Mapping quality is unavailable for use with Bowtie + } + + ##### + + my $cigar; + if ($bowtie2){ + $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 + } + else{ + $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) + } + + ##### + + my $rnext = "*"; # Paired-end variable + + ##### + + my $pnext = 0; # Paired-end variable + + ##### + + my $tlen = 0; # Paired-end variable + + ##### + + if ($read_conversion eq 'CT'){ + $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands + } + else{ + $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries + } + + if ($strand eq '-'){ + $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand + $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence + if ($cigar =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} ); + } + $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well + } + + ##### + + my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string + # into the reference string. hemming_dist() + if ($bowtie2){ + $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + + my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $MD_tag = make_mismatch_string($actual_seq, $ref_seq,$cigar,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag}); # Optional tag MD: string providing mismatched reference bases in the alignment (this does include indel information) + # my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) + + ##### + + my $XM_tag; # Optional tag XM: Methylation Call String + if ($strand eq '+'){ + $XM_tag = "XM:Z:$methcall"; + } + elsif ($strand eq '-'){ + $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well + } + + ##### + + my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches =~ s/-//; # removing the minus sign + + ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches + if ($cigar =~ /(D|I)/) { + # warn "$cigar\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches -= $insertion_open; + $number_of_mismatches -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches -= $deletion_open; + $number_of_mismatches -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + # warn "Alignment score $number_of_mismatches\n"; + # print "Mismatches $number_of_mismatches\n\n"; + } + ### Now we have InDel corrected alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division + # warn "N count: $seq_N_count\n"; + $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; + # warn "MM $number_of_mismatches\n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches"; + + #### + + my $read_group; # optional + if ($rg_tag){ + $read_group = "RG:Z:$rg_id"; + } + + #### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + if ($rg_tag){ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag,$read_group)),"\n"; + } + else{ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; + } + } + else{ # default + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + if ($rg_tag){ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag,$read_group)),"\n"; + } + else{ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; + } + } +} + +sub paired_end_SAM_output{ + + my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; + my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand + my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; + my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; + my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; + my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; + my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; + my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + + my $id_1; + my $id_2; + + if ($old_flag){ + $id_1 = $id.'/1'; + $id_2 = $id.'/2'; + } + else{ + $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard + $id_2 = $id; + } + + # Allows all degenerate nucleotide sequences in reference genome + # die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHVX]/i; # X are padded nucleotides in case of insertions in the read + # die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHVX]/i; + + my $index; # used to store the srand origin of the alignment in a less convoluted way + + if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT (original top strand) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ + $index = 1; ## this is CTOB (complementary to OB) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ + $index = 2; ## this is CTOT (complementary to OT) + } + elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB (original bottom) + } + else { + die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; + } + + my $number_of_mismatches_1; + my $number_of_mismatches_2; + + if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine + $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; + } + else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation + if ($index == 2 or $index == 3){ # CTOT or OB + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; + } + else{ # if the first read aligned in forward direction it is like for Bowtie 2 + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; + } + } + + + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the + ### first or last position. + + if ($index == 0 or $index == 3){ # OT or OB + $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); + } + else{ # CTOT or CTOB + $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); + } + + ##### + + my $start_read_1; + my $start_read_2; + # adjusting end positions + + if ($bowtie2){ + $start_read_1 = $methylation_call_params->{$id}->{position_1}; + $start_read_2 = $methylation_call_params->{$id}->{position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; + $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; + } + else{ # read 1 is on the - strand + $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; + $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; + } + } + + ##### + + my $end_read_1; + my $end_read_2; + # adjusting end positions + + if ($bowtie2){ + $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; + $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; + $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; + } + else{ + $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; + $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; + } + } + + ##### + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2^^7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account + + # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) + # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) + + my $flag_1; # FLAG variable used for SAM format + my $flag_2; + + ### The new default FLAG values were changed on 21 07 2015, so that reads do not ignored as discordant reads by the new SeqMonk BAM import + ### In essence we are going to flip the R1 R2 flags around for CTOT and CTOB reads. We still report the first and second read in the same + ### order and only change the actual FLAG value. This should not affect the methylation extraction in any way + + if ($index == 0){ # OT + unless ($old_flag){ + $flag_1 = 99; # Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # Read 2 is reverse complemented but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) + $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + } + } + elsif ($index == 1){ # CTOB + unless($old_flag){ + $flag_1 = 163; # Read 1 is on the forward strand (CTOB) and Read 2 is reverse complemented but we swap round the FLAG + # for R1 and R2 so that we don't end up with discordant pairs + # So Read 1 gets Paired read, mapped in proper pair, mate is reversed and second in pair (1+2+32+128) + $flag_2 = 83; # Read 2 gets Read paired, mapped in proper pair, first in pair and Read 2 is reversed (1+2+16+64) + } + else{ + $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + } + } + elsif ($index == 2){ # CTOT + unless ($old_flag){ + $flag_1 = 147; # Read 1 is reverse complemented (CTOT) and Read 2 is the forward read + # but we swap round the FLAG for R1 and R2 so that we do not end up with discordant pairs + # So Read 1 gets Read paired, read mapped in proper pair, read reverse complemented and second in pair (1+2+32+128) + $flag_2 = 99; # Read 2 gets Read paired, read mapped in proper pair, mate reverse strand and First in Pair (1+2+32+64) + } + else{ + $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) + $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + } + } + elsif ($index == 3){ # OB + unless ($old_flag){ + $flag_1 = 83; # Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # Read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + } + } + + ##### + + my $mapq; + + if ($bowtie2){ + $mapq = $methylation_call_params->{$id}->{mapq}; + } + else{ + $mapq = 255; # Mapping quality is unavailable for use with Bowtie + } + + ##### + + my $cigar_1; + my $cigar_2; + + if ($bowtie2){ + $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 + $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; + } + else{ + $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) + $cigar_2 = length($actual_seq_2) . "M"; + } + + ##### + + my $rnext = '='; # Chromosome of mate; applies to both reads + + ##### + + my $pnext_1 = $start_read_2; # Leftmost position of mate + my $pnext_2 = $start_read_1; + + ##### + + my $tlen_1; # signed observed Template LENgth (or inferred fragment size) + my $tlen_2; + + if ($bowtie2){ + + if ($start_read_1 <= $start_read_2){ + + # Read 1 alignment is leftmost + + if ($end_read_2 >= $end_read_1){ + + if ($flag_1 == 83 and $dovetail){ # R1 has a reverse orientation + # -----------------> read 2 reads are dovetailing, that is one mate alignment extends past the beginning of the other + # <------------------- read 1 such that the wrong mate begins upstream + # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; + # warn "Reads are dovetailing\n"; + $tlen_1 = $start_read_1 - $end_read_2 - 1; # Read 1 still receives a - sign even though it is the leftmost one + $tlen_2 = $end_read_2 - $start_read_1 + 1; # Read 2 receives a + sign, + # warn "TLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; + } + else{ + # -------------> read 1 reads not overlapping + # <---------- read 2 + # or + # -------------------> read 1 reads overlapping + # <------------------- read 2 + # or + # -------------------------> read 1 + # <----------------------- read 2 read 2 contained within read 1 + # or + # -------------------------> read 1 reads 1 and 2 exactly overlapping + # <------------------------- read 2 + # + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + # warn "Reads are non/overlapping\nTLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; + } + } + elsif ($end_read_2 < $end_read_1){ + + # -------------------------> read 1 + # <----------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 + # <------------------------ read 2 read 2 contained within read 1 + + # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable + $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, + $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + + } + + elsif ($start_read_2 < $start_read_1){ + + # Read 2 alignment is leftmost + + if ($end_read_1 >= $end_read_2){ + + # Read 2 alignment is leftmost + if ($flag_1 == 99 and $dovetail){ # R1 has a forward orientation + + # -----------------> read 1 reads are dovetailing, that is one mate alignment extends past the beginning of the other + # <------------------- read 2 such that the wrong mate begins upstream + + # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; + # warn "Reads are dovetailing\n"; + $tlen_1 = $end_read_1 - $start_read_2 + 1; # Read 1 still receives a + sign even though it is not leftmost + $tlen_2 = $start_read_2 - $end_read_1 - 1; + # warn "TLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; + } + else{ + # -------------> read 2 reads not overlapping + # <---------- read 1 + # or + # -------------------------> read 2 reads overlapping + # <------------------------- read 1 + # or + # -------------------------> read 2 + # <----------------------- read 1 read 1 contained within read 2 + # or + # -------------------------> read 2 + # <----------------------- read 1 read 1 contained within read 2 + # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; + # warn "Read 2 has a forward orientation\n"; + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + } + elsif ($end_read_1 < $end_read_2){ + + # -------------------------> read 2 + # <----------- read 1 read 1 contained within read 2 + # + # or + # + # -------------------------> read 2 + # <------------------------ read 1 read 1 contained within read 2 + + # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable + $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, + $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + } + } + + else{ # Bowtie 1 + + if ($end_read_2 >= $end_read_1){ + # Read 1 alignment is leftmost + # -------------------------> read 1 + # <------------------------- read 2 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + else{ + # Read 2 alignment is leftmost + # -------------------------> read 2 + # <------------------------- read 1 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + } + + ##### + + # adjusting the strand of the sequence before we use them to generate mismatch strings + if ($strand_1 eq '-'){ + $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand + $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence + if ($cigar_1 =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} ); + } + $qual_1 = reverse $qual_1; # we need to reverse the quality string as well + } + if ($strand_2 eq '-'){ + $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand + $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence + if ($cigar_2 =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} ); + } + $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well + } + + # print "$actual_seq_1\n$ref_seq_1\n\n"; + # print "$actual_seq_2\n$ref_seq_2\n\n"; + + ##### + + my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence + my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); + if ($bowtie2){ + $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences + my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information) + my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2}); + + # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) + # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); + + ##### + + my $XM_tag_1; # Optional tag XM: Methylation call string + my $XM_tag_2; + + if ($strand_1 eq '-'){ + $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_1 = "XM:Z:$methcall_1"; + } + + if ($strand_2 eq '-'){ + $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_2 = "XM:Z:$methcall_2"; + } + + ##### + + my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state + my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches_1 =~ s/-//; # removing the minus sign + $number_of_mismatches_2 =~ s/-//; + + ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches + + ### CIGAR 1 + if ($cigar_1 =~ /(D|I)/) { + # warn "$cigar_1\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_1); # storing the length per operation + my @ops = split (/\d+/,$cigar_1); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_1 -= $insertion_open; + $number_of_mismatches_1 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_1 -= $deletion_open; + $number_of_mismatches_1 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + # warn "Alignment score $number_of_mismatches_1\n"; + # print "Mismatches $number_of_mismatches_1\n\n"; + } + + ### CIGAR 2 + if ($cigar_2 =~ /(D|I)/) { + # warn "$cigar_2\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_2); # storing the length per operation + my @ops = split (/\d+/,$cigar_2); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; #irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_2 -= $insertion_open; + $number_of_mismatches_2 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_2 -= $deletion_open; + $number_of_mismatches_2 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + } + + ### Now we have InDel corrected Alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division + my $seq_2_N_count = $number_of_mismatches_2 % 6; + # warn "N count 1: $seq_1_N_count\n"; + # warn "N count 2: $seq_2_N_count\n"; + + $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; + $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; + + # warn "MM1 $number_of_mismatches_1 \n"; + # warn "MM2 $number_of_mismatches_2 \n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches_1"; + my $XB_tag = "XB:Z:$number_of_mismatches_2"; + + #### + + my $read_group; # optional + if ($rg_tag){ + $read_group = "RG:Z:$rg_id"; + } + + #### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + if ($rg_tag){ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag,$read_group)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag,$read_group)), "\n"; + } + else{ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; + } + } + else{ # default + if ($rg_tag){ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$read_group)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$read_group)), "\n"; + } + else{ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; + } + } +} + + +sub revcomp{ + my $seq = shift or die "Missing seq to reverse complement\n"; + $seq = reverse $seq; + $seq =~ tr/ACTGactg/TGACTGAC/; + return $seq; +} + +sub hemming_dist{ + my $matches = 0; + my @actual_seq = split //,(shift @_); + my @ref_seq = split //,(shift @_); + + foreach (0..$#actual_seq){ + ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); + } + return my $hd = scalar @actual_seq - $matches; +} + + +### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might +### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels +### 05 June 2014 + + +sub make_mismatch_string{ + my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_; + + my $MD_tag = "MD:Z:"; + my $prev_matching = 0; + my $last_char; + + my $ref_base; + my $actual_base; + + foreach my $pos ( 0..(length$actual_seq) - 1 ){ + + $actual_base = substr($actual_seq,$pos,1); + $ref_base = substr($ref_seq,$pos,1); + # if ($verbose){ warn "reference: $ref_base\tseen base: $actual_base\n";} + + if ( $actual_base eq $ref_base ){ + ++$prev_matching; + } + else{ + # If the mismatch is due to an insertion we simply move on, else we print the previously matching bases as well as the mismatching genomic base + if ($ref_base eq 'X'){ + # if ($verbose){ warn "The genome base was an artificually padded '$ref_base' due to an insertion in the read at this position. Just ignoring it for the MD tag\n"; sleep(1);} + } + else{ + # if ($verbose){ warn "previous matching bases: $prev_matching\n";} + + ### There is a mismatch between the sequence and the genome. First we need to write out how may bases matched until now + if ($prev_matching == 0){ + # if ($verbose){ warn "Got a mismatch either at the very start or next to another mismatch. Need to add a padding 0 as well as the mismatch\n";} + # if ($verbose){ warn "${prev_matching}$ref_base\n";} + $MD_tag .= $prev_matching; + $MD_tag .= $ref_base; + } + else{ + # if ($verbose){ warn "${prev_matching}$ref_base\n";} + $MD_tag .= $prev_matching; + $MD_tag .= $ref_base; + } + + $prev_matching = 0; # resetting $prev_matching + } + + } + + } + ### appending the number of matches one last time + $MD_tag .= $prev_matching; + + + ### If the read contains deletion(s) we need to take care of these in the MD-tag as well + if ($cigar =~ /D/){ + my $deletions_total = 0; + while ($cigar =~ /D/g){ + ++$deletions_total; + } + if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";} + + if ($verbose){ warn "There was a deletion in the read!\n";} + if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";} + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my $MD_pos_so_far = 0; + my $deletions_processed = 0; + my $del_pos = 0; + my $deleted_bases = ''; + my $new_MD = $1 if ($MD_tag =~ /MD:Z:(.*)/); + my $md_index_already_processed; + + my @md = split //,$new_MD; + + if ($verbose){ warn "New MD-tag: $new_MD\n\n";} + $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag + $new_MD = ''; # using this to build up a new string that will replace the old \@md + + if ($verbose){ warn "CIGAR string; $cigar\n";} + ### determining end position of a read + foreach my $index(0..$#len){ + + if ($ops[$index] eq 'M'){ # matching bases + $del_pos += $len[$index]; + if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";} + } + elsif($ops[$index] eq 'I'){ # insertion + $del_pos += $len[$index]; + ### need to add insertions in the read to MD pos so far! + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";} + } + elsif($ops[$index] eq 'D'){ # deletion + if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";} + $deleted_bases = substr($md_sequence,$del_pos,$len[$index]); + if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";} + + ### Now we need to process the MD-tag so far and write out everything up until this point, inlcuding the deletion + if ($verbose){ warn "Now processing the MD-tag\n";} + my $op; + + my $this_deletion_processed; + my $md_processed_so_far; + my $current_md_index; + + foreach my $el (@md){ + + unless (defined $current_md_index){ + $current_md_index = 0; # first element = index 0 + } + else{ + ++$current_md_index; + } + + if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){ + if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";} + $new_MD .= $el; + next; + } + + if ($verbose){ warn "Current element: $el\n";} + unless (defined $op){ # initialize + $op = $el; + if ($verbose){ warn "Initializing \$op as $op\n";} + next; + } + + if ($deletions_processed == $deletions_total){ + if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";} + $MD_tag .= $el; + $new_MD .= $el; + next; + } + # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag + if ($this_deletion_processed){ + $new_MD .= $el; + next; + } + + if ($op =~ /^\d+$/){ + if ($verbose){ warn "Operation so far was a digit: $op\n";} + if ($el =~ /\d/){ + $op .= $el; + if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";} + next; + } + else{ + if ($verbose){ warn "current element is a word character: $el\n";} + + ### Need to determine if the matching operation length includes the deletion position + if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";} + $MD_pos_so_far += $op; + if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";} + if ($MD_pos_so_far < $del_pos){ + if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";} + $MD_tag .= $op; + $new_MD .= $op; + if ($verbose){ warn "Setting new operation to: $el\n";} + $op = $el; # setting new $op + } + else{ + if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";} + ### splitting up the number of matching bases in number before and after the deletion + + my $pos_after_deletion = $MD_pos_so_far - $del_pos; + my $pos_before_deletion = $op - $pos_after_deletion; + if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";} + $MD_tag .= "${pos_before_deletion}^${deleted_bases}"; + $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}"; + if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";} + + #adjusting the MD_position by the number of bases after the deletion + $MD_pos_so_far -= $pos_after_deletion; + if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; } + ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD + $new_MD .= $el; + + $deletions_processed += 1; + $this_deletion_processed = 1; + + if ($deletions_processed == $deletions_total){ # this was the last deletion of the read + if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";} + $MD_tag .= "${pos_after_deletion}"; + + ### also appending the current element because we are writing out the rest of the MD-string unchanged + if ($verbose){ warn "also appending the current element $el\n";} + $MD_tag .= $el; + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + } + else{ + if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";} + if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";} + + + ### This migh need looking at!! + + $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op; + if ($verbose){ warn "Current index = $current_md_index\n";} + + if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";} + $md_index_already_processed = $current_md_index - 1; + + if ($verbose){ warn "Exiting now and waiting for the next deletion\n";} + + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + #setting $op to en empty string so it is not being processed as the last element + $op = ''; + # last; # exiting the loop and processing the CIGAR string further until we hit the next deletion + } + } + } + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + else{ + if ($verbose){ warn "Operation so far was a word character: $op\n";} + if ($el =~ /\d+/){ + # processing the previous mismatch position + $MD_tag .= $op; + $new_MD .= $op; + $MD_pos_so_far += length($op); + if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";} + } + else{ + # this should never occur since mismatches are followed by a 0 or another digit + die "current element is a another word character: $el. This should never happen!\n"; + } + if ($verbose){ warn "Setting new operation to: $el\n";} + $op = $el; # setting new $op + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + } + + ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag + if ($op =~ /\d+/ and $deletions_processed < $deletions_total){ + if ($verbose){ warn "\n\nlast operation was $op\n";} + if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";} + + $MD_pos_so_far += $op; + if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";} + if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";} + if ($MD_pos_so_far >= $del_pos){ + if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";} + ### splitting up the number of matching bases in number before and after the deletion + + my $pos_after_deletion = $MD_pos_so_far - $del_pos; + my $pos_before_deletion = $op - $pos_after_deletion; + if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";} + + $MD_tag .= "${pos_before_deletion}^${deleted_bases}"; + $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}"; + + #adjusting the MD_position by the number of bases after the deletion + $MD_pos_so_far -= $pos_after_deletion; + if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; } + + $deletions_processed += 1; + $this_deletion_processed = 1; + + if ($deletions_processed == $deletions_total){ # this was the last deletion of the read + if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";} + $MD_tag .= "${pos_after_deletion}"; + + } + else{ + if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";} + if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";} + + $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op; + if ($verbose){ warn "Current index = $current_md_index\n";} + + if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";} + # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (tit hasn't been incremented in the first place...) + $md_index_already_processed = $current_md_index; + + if ($verbose){ warn "Exiting now and waiting for the next deletion\n";} + + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + } + else{ + die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n"; + } + } + + # forming a new @md + @md = split //,$new_MD; + $new_MD = ''; + if ($verbose){ warn "New \@md array: @md\n\n";} + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";} + + } + else{ + die "Found CIGAR operations other than M, I, D or N: '$ops[$index]'. Not allowed at the moment\n"; + } + } + + } + if ($verbose){ warn "Returning MD-tag: $MD_tag\n";} + return $MD_tag; + +} + +### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might +### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels +# sub make_mismatch_string{ +# my $actual_seq = shift or die "Missing actual sequence\n"; +# my $ref_seq = shift or die "Missing reference sequence\n"; +# my $XX_tag = "XX:Z:"; + +# my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison + +# warn "'$tmp'\n"; sleep(1); +# my $prev_mm_pos = 0; + +# while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference +# my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch +# my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read +# $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other +# $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) +# $prev_mm_pos = pos($tmp); # Position of last mismatch +# } +# my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence +# $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence +# return $XX_tag; +# } + + + +sub print_helpfile{ + print << "HOW_TO"; + + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + + +DESCRIPTION + + +The following is a brief description of command line options and arguments to control the Bismark +bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the +reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand +version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). +Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome +(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the +forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) +are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original +sequence from the genome and determine if there were any protected C's present or not. + +As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be +re-enabled by using --non_directional. + +The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old +'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best +alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. + + +USAGE: bismark [options] {-1 -2 | } + + +ARGUMENTS: + + The path to the folder containing the unmodified reference genome + as well as the subfolders created by the Bismark_Genome_Preparation + script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). + Bismark expects one or more fastA files in this folder (file extension: .fa + or .fasta). The path can be relative or absolute. The path may also be set as + '--genome_folder /path/to/genome/folder/'. + +-1 Comma-separated list of files containing the #1 mates (filename usually includes + "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. Bismark will produce one mapping result + and one report file per paired-end input file pair. + +-2 Comma-separated list of files containing the #2 mates (filename usually includes + "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. + + A comma- or space-separated list of files containing the reads to be aligned (e.g. + lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will + produce one mapping result and one report file per input file. + + +OPTIONS: + + +Input: + +--se/--single_end Sets single-end mapping mode explicitly giving a list of file names as . + The filenames may be provided as a comma [,] or colon [:] separated list. + +-q/--fastq The query input files (specified as , or are FASTQ + files (usually having extension .fg or .fastq). This is the default. See also + --solexa-quals. + +-f/--fasta The query input files (specified as , or are FASTA + files (usually having extensions .fa, .mfa, .fna or similar). All quality values + are assumed to be 40 on the Phred scale. FASTA files are expected to contain both + the read name and the sequence on a single line (and not spread over several lines). + +-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input. + +-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit. + +--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. + +--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. + +--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled + (which can't). The formula for conversion is: + phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This + is usually the right option for use with (unconverted) reads emitted by the GA + Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. + +--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) + reads emitted by GA Pipeline version 1.3 or later. Default: off. + +--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not + specified it is assumed that Bowtie (1 or 2) is in the PATH. + + +Alignment: + +-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs + of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the + default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). + +-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to + which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for + larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). + +-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout + the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds + quality values to the nearest 10 and saturates at 30. This value is not relevant for + Bowtie 2. + +--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in + --best mode. Best-first search must keep track of many paths at once to ensure it is + always extending the path with the lowest cumulative cost. Bowtie tries to minimize the + memory impact of the descriptors, but they can still grow very large in some cases. If + you receive an error message saying that chunk memory has been exhausted in --best mode, + try adjusting this parameter up to dedicate more memory to the descriptors. This value + is not relevant for Bowtie 2. Default: 512. + +-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and + a paired-end alignment consists of two 20-bp alignments in the appropriate orientation + with a 20-bp gap between them, that alignment is considered valid (as long as -X is also + satisfied). A 19-bp gap would not be valid in that case. Default: 0. + +-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and + a paired-end alignment consists of two 20-bp alignments in the proper orientation with a + 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). + A 61-bp gap would not be valid in that case. Default: 500. + +--multicore Sets the number of parallel instances of Bismark to be run concurrently. This forks the + Bismark alignment step very early on so that each individual Spawn of Bismark processes + only every n-th sequence (n being set by --multicore). Once all processes have completed, + the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged + into single files in very much the same way as they would have been generated running Bismark + conventionally with only a single instance. + + If system resources are plentiful this is a viable option to speed up the alignment process + (we observed a near linear speed increase for up to --multicore 8 tested). However, please note + that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of + Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner + and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified + will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for + e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB or RAM, but at the same time + reduce the alignment time to ~25-30%. You have been warned. + + + +Bowtie 1 Reporting: + +-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option + will be used by default. + +--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum + (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in + terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred + quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both + have Phred quality 10. When --best is not specified, Bowtie may report alignments that + are sub-optimal in terms of stratum and/or quality (though an effort is made to report + the best alignment). --best mode also removes all strand bias. Note that --best does not + affect which alignments are considered "valid" by Bowtie, only which valid alignments + are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. + Default: on. + +--no_best Disables the --best option which is on by default. This can speed up the alignment process, + e.g. for testing purposes, but for credible results it is not recommended to disable --best. + + +Output: + +--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four + bisulfite strands will be reported. Default: OFF. + + (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary + to the original strands are merely theoretical and should not exist in reality. Specifying directional + alignments (which is the default) will only run 2 alignment threads to the original top (OT) + or bottom (OB) strands in parallel and report these alignments. This is the recommended option + for sprand-specific libraries). + +--pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., + PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, + as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT + and OB ones. Use this option only if you are certain that your libraries were constructed following + a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option + --pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed + temporary files only). + +--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are + split up into several smaller files to run concurrently and the output files are to be merged. + +--rg_tag Write out a Read Group tag to the resulting SAM/BAM file. This will write the following line to the + SAM header: \@RG PL: ILLUMINA ID:SAMPLE SM:SAMPLE ; to set ID and SM see --rg_id and --rg_sample. + In addition each read receives an RG:Z:RG-ID tag. Default: OFF. + +--rg_id Sets the ID field in the \@RG header line. The default is 'SAMPLE'. + +--rg_sample Sets the SM field in the \@RG header line; can't be set without setting --rg_id as well. The default is + 'SAMPLE'. + +--quiet Print nothing besides alignments. + +--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead + of SAM format output. + +-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will + appear as they did in the input, without any translation of quality values that may have + taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 + and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads + with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) + are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. + +--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest + mismatches or other reads that fail to align uniquely to a file in the output directory. + Written reads will appear as. they did in the input, without any of the translation of quality + values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two + parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and + _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. + +-o/--output_dir Write all output files into this directory. By default the output files will be written into + the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt + to create it first. The path to the output folder can be either relative or absolute. + +--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If + the specified folder does not exist, Bismark will attempt to create it first. The path to the + temporary folder can be either relative or absolute. + +--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the + alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is + just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. + The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' + and 'XB:Z:number of mismatches' for read 2 of paired-end reads. + +--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk + space. This option is available for most alignment modes but is not available for paired-end FastA + files. This option might be somewhat slower than writing out uncompressed files, but this awaits + further testing. + +--sam The output will be written out in SAM format instead of the default BAM format. Bismark will + attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't + been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, + the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). + +--cram Writes the output to a CRAM file instead of BAM. This requires the use of Samtools 1.2 or higher. + +--cram_ref CRAM output requires you to specify a reference genome as a single FastA file. If this single-FastA + reference file is not supplied explicitly it will be regenerated from the genome .fa sequence(s) + used for the Bismark run and written to a file called 'Bismark_genome_CRAM_reference.mfa' into the + oputput directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--prefix Prefixes to the output filenames. Trailing dots will be replaced by a single one. For + example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. + +-B/--basename Write all output to files starting with this base file name. For example, '--basename foo' + would result in the files 'foo.bam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes + precedence over --prefix. + +--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, + this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since + both the appended read IDs and custom FLAG values may cause problems with some downstream tools + such as Picard, new defaults were implemented as of version 0.8.3. + + + default old_flag + =================== =================== + Read 1 Read 2 Read 1 Read 2 + + OT: 99 147 67 131 + + OB: 83 163 115 179 + + CTOT: 147 99 67 131 + + CTOB: 163 83 115 179 + +--ambig_bam For reads that have multiple alignments a random alignment is written out to a special file ending in + '.ambiguous.bam'. The alignments are in Bowtie2 format and do not any contain Bismark specific + entries such as the methylation call etc. These ambiguous BAM files are intended to be used as + coverage estimators for variant callers. + +--nucleotide_coverage Calculates the mono- and di-nucleotide sequence composition of covered positions in the analysed BAM + file and compares it to the genomic average composition once alignments are complete by calling 'bam2nuc'. + Since this calculation may take a while, bam2nuc attempts to write the genomic sequence composition + into a file called 'genomic_nucleotide_frequencies.txt' indside the reference genome folder so it can + be re-used the next time round instead of calculating it once again. If a file 'nucleotide_stats.txt' is + found with the Bismark reports it will be automatically detected and used for the Bismark HTML report. + This option works only for BAM or CRAM files. + + +Other: + +-h/--help Displays this help file. + +-v/--version Displays version information. + + +BOWTIE 2 SPECIFIC OPTIONS + +--bowtie1 Uses Bowtie 1 instead of Bowtie 2, which might be a good choice for faster and very short + alignments. Bismark assumes that raw sequence data is adapter and/or quality trimmed where + appropriate. Default: off. + +--bowtie2 Default: ON. Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end + alignments, i.e. searches for alignments involving all read characters (also called + untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter + and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2 + indexes are supported. + +Bowtie 2 alignment options: + +-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. + Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) + but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for + Bowtie 1 see -n). + +-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values + make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is + used by default, which sets -L to 20. This option is only available for Bowtie 2 (for + Bowtie 1 see -l). + +--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched + position to be the highest possible, regardless of the actual value. I.e. input is treated + as though all quality values are high. This is also the default behavior when the input + doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. + + +Bowtie 2 paired-end options: + +--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if + it cannot find a concordant or discordant alignment for a pair. This option is invariable and + and on by default. + +--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. + A discordant alignment is an alignment where both mates align uniquely, but that does not + satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior + and it is on by default. + +--dovetail It is possible, though unusual, for the mates to "dovetail", with the mates seemingly extending + "past" each other as in this example: + + Mate 1: GTCAGCTACGATATTGTTTGGGGTGACACATTACGC + Mate 2: TATGAGTCAGCTACGATATTGTTTGGGGTGACACAT + Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC + + By default, dovetailing is considered inconsistent with concordant alignment, but setting --dovetail + causes Bowtie 2 to consider dovetailing alignments as concordant. This becomes relevant whenever + Reads are clipped from their 5' end prior to mapping, e.g. because of quality or bias issues. + --dovetail is set automatically for PBAT libraries. + + +Bowtie 2 effort options: + +-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using + the alignments found so far. A seed extension "fails" if it does not yield a new best or a + new second-best alignment. Default: 15. + +-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. + When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of + mismatches allowed) at different offsets and searches for more alignments. A read is considered + to have repetitive seeds if the total number of seed hits divided by the number of seeds + that aligned at least once is greater than 300. Default: 2. + +Bowtie 2 parallelization options: + + +-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores + and synchronize when parsing reads and outputting alignments. Searching for alignments is highly + parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. + E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint + by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads + library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will + automatically use the option '--reorder', which guarantees that output SAM records are printed in + an order corresponding to the order of the reads in the original input file, even when -p is set + greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and + setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then + if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally + correspond to input order in that case. + +Bowtie 2 Scoring options: + +--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered + "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying + L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. + See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is + L,0,-0.2. + +--rdg , Sets the read gap open () and extend () penalties. A read gap of length N gets a penalty + of + N * . Default: 5, 3. + +--rfg , Sets the reference gap open () and extend () penalties. A reference gap of length N gets + a penalty of + N * . Default: 5, 3. + + +Bowtie 2 Reporting options: + +-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is + deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the + default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the + effort expended to find valid alignments. + + For reference, this used to be the old (now deprecated) description of -M: + Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it + can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever + happens first. Only the best alignment is reported. Information from the other alignments is used to + estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes + Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that + aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not + guarantee that the alignment reported is the best possible in terms of alignment score. -M is + always used and its default value is set to 10. + + +'VANILLA' Bismark OUTPUT: + +Single-end output format (tab-separated): + + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(11) + + +Paired-end output format (tab-separated): + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(10) +(11) +(12) +(14) +(15) + + +Bismark SAM OUTPUT (default): + + (1) QNAME (seq-ID) + (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) + (3) RNAME (chromosome) + (4) POS (start position) + (5) MAPQ (always 255 for use with Bowtie) + (6) CIGAR + (7) RNEXT + (8) PNEXT + (9) TLEN +(10) SEQ +(11) QUAL (Phred33 scale) +(12) NM-tag (edit distance to the reference) +(13) MD-tag (base-by-base mismatches to the reference (handles indels) +(14) XM-tag (methylation call string) +(15) XR-tag (read conversion state for the alignment) +(16) XG-tag (genome conversion state for the alignment) +(17) XA/XB-tag (non-bisulfite mismatches) (optional!) + +Each read of paired-end alignments is written out in a separate line in the above format. + + +Last edited on 25 July 2016 +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_mapping/bismark_bowtie2_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_mapping/bismark_bowtie2_wrapper.xml Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,561 @@ + + + Bisulfite reads mapper + + + samtools + bowtie2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (( + params['settingsType'] == "default" or + ( params['settingsType'] == "custom" and + params['isReportOutput'] is True ) + )) + + + + + (( + params['settingsType'] == "custom" and + params['bismark_stdout'] is True + )) + + + + + + + + + + + + + + + + + + + + + + + (( + params['settingsType'] == "custom" and + params['suppressed_read_file'] is True + )) + + + + + + + + + + + + + + (( + singlePaired['sPaired'] == "paired" and + params['settingsType'] == "custom" and + params['suppressed_read_file'] is True + )) + + + + + + + + + + + + + + + + + + (( + params['settingsType'] == "custom" and + params['unmapped_read_file'] is True + )) + + + + + + + + + + + + + + + + + (( + singlePaired['sPaired'] == "paired" and + params['settingsType'] == "custom" and + params['unmapped_read_file'] is True + )) + + + + + + + + + + + + + + + + + + + +T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). + | Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the forward strand, by doing this alignments will produce the same positions). + | These instances of Bowtie 2 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original sequence from the genome and determine if there were any protected C's present or not. + | + | As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be re-enabled by using non_directional mode. + | + | It is developed by Krueger F and Andrews SR. at the Babraham Institute. Krueger F, Andrews SR. (2011) Bismark: a flexible aligner and methylation caller for Bisulfite-Seq applications. Bioinformatics, 27, 1571-2. + +.. _Bismark: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ + +---- + +**Know what you are doing** + + .. class:: warningmark + + | There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. + | In other words = running this tool with default parameters will probably not give you meaningful results. + | A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. + | + + .. class:: warningmark + + | Make sure all your input reads are in the correct and same format. If thats not the case please adjust/convert the filetype with galaxy's build-in converters. + +.. __: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ + +---- + +**Input formats** + + | Bismark accepts files in either Sanger FASTQ format (galaxy type *fastqsanger*), Illumina FASTQ format (galaxy type *fastqillumina*) or FASTA format (galaxy type *fasta*). Use the FASTQ Groomer to prepare your files. + +---- + +**A Note on Built-in Reference Genomes** + + | The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. + | When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. + | The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY. + +---- + +**Outputs** + +* The final output of Bismark is in SAM format by default, and has the following columns:: + + Column Description + -------- -------------------------------------------------------- + 1 QNAME seq-ID + 2 FLAG this flag tries to take the strand a bisulfite read + originated from into account + (this is different from ordinary DNA alignment flags!) + 3 RNAME chromosome + 4 POS start position + 5 MAPQ always 255 + 6 CIGAR extended CIGAR string + 7 MRNM Mate Reference sequence NaMe ('=' if same as RNAME) + 8 MPOS 1-based Mate POSition + 9 ISIZE Inferred insert SIZE + 10 SEQ query SEQuence on the same strand as the reference + 11 QUAL Phred33 scale + 12 NM-tag edit distance to the reference) + 13 XX-tag base-by-base mismatches to the reference. + This does not include indels. + 14 XM-tag methylation call string + 15 XR-tag read conversion state for the alignment + 16 XG-tag genome conversion state for the alignment + + + | Each read of paired-end alignments is written out in a separate line in the above format. + | + +* It looks like this (scroll sideways to see the entire example):: + + QNAME FLAG RNAME POS MAPQ CIAGR MRNM MPOS ISIZE SEQ QUAL OPT + HWI-EAS91_1_30788AAXX:1:1:1761:343 4 * 0 0 * * 0 0 AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh + HWI-EAS91_1_30788AAXX:1:1:1578:331 4 * 0 0 * * 0 0 GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh + +---- + +**Note on Bismark settings** + + | All of the options have a default value. You can change any of them. If any Bismark function is missing please contact the tool author or your Galaxy admin. + +---- + +**Bismark settings** + +* **If Paired-End Reads** + + * **Minimum insert size for valid paired-end alignments** + + | The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as -X is also satisfied). A 19-bp gap would not be valid in that case. Default: 0. + | + | Alignment option *-I/--minins * + + + * **Maximum insert size for valid paired-end alignments** + + | The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). A 61-bp gap would not be valid in that case. Default: 500. + | + | Alignment option *-X/--maxins * + + +* **Number of mismatches to be allowed in a seed alignment during multiseed alignment** + + | Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for Bowtie 1 see -n). + | + | Bowtie 2 Specific option *-N * + +* **Length of the seed substrings to align during multiseed alignment** + + | Sets the length of the seed substrings to align during multiseed alignment. Smaller values make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is used by default, which sets -L to 20. This option is only available for Bowtie 2 (for Bowtie 1 see -l). + | + | Bowtie 2 Specific option *-L * + + +* **How many consecutive seed extension attempts can fail before Bowtie 2 moves on** + + | Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using the alignments found so far. A seed extension "fails" if it does not yield a new best or a new second-best alignment. Default: 15. + | + | Bowtie 2 Effort option *-D * + +* **Maximum number of times Bowtie 2 will re-seed reads with repetitive seeds** + + | is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of mismatches allowed) at different offsets and searches for more alignments. A read is considered to have repetitive seeds if the total number of seed hits divided by the number of seeds that aligned at least once is greater than 300. Default: 2. + | + | Bowtie 2 Effort option *-R * + +* **Only aligns the first N reads or read pairs from the input** + + | Only aligns the first reads or read pairs from the input. Default: no limit. + | + | Input option *-u/--upto * + +* **Skip (i.e. do not align) the first N reads or read pairs from the input** + + | Input option *-s/--skip* + +* **Disable looking for discordant alignments if it cannot find any concordant alignments** + + | Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. A discordant alignment is an alignment where both mates align uniquely, but that does not satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior and it is on by default. + | + | Bowtie 2 Paired-End option *--no-discordant* + +* **Disable Bowtie 2's behaviour to try to find alignments for the individual mates** + + | This option disables Bowtie 2's behavior to try to find alignments for the individual mates if it cannot find a concordant or discordant alignment for a pair. This option is invariable and on by default. + | + | Bowtie 2 Paired-End option *--no-mixed* + +* **Write ambiguous reads to an extra output file** + + | Write all reads which produce more than one valid alignment with the same number of lowest mismatches or other reads that fail to align uniquely to a file in the output directory. Written reads will appear as they did in the input, without any of the translation of quality values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. + | + | Output option *--ambiguous* + +* **Write all reads that could not be aligned to a file** + + | Write all reads that could not be aligned to a file in the output directory. Written reads will appear as they did in the input, without any translation of quality values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. + | + | Output option *-un/--unmapped* + +* **Offer all report files concatenated in one file** + + | Prints out a Bismark mapping report + +]]> + + + 10.1093/bioinformatics/btr167 + + \ No newline at end of file diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_mapping/bismark_genome_preparation --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_mapping/bismark_genome_preparation Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,632 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Cwd; +use Getopt::Long; +$|++; + + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my $verbose; +my $help; +my $version; +my $man; +my $path_to_bowtie; +my $multi_fasta; +my $single_fasta; +my $bowtie2; +my $bowtie1; +my $parent_dir = getcwd(); + +my $genomic_composition; +my %genomic_freqs; # storing the genomic sequence composition +my %freqs; + +my $bismark_version = 'v0.16.3'; + +GetOptions ('verbose' => \$verbose, + 'help' => \$help, + 'man' => \$man, + 'version' => \$version, + 'path_to_bowtie:s' => \$path_to_bowtie, + 'single_fasta' => \$single_fasta, + 'bowtie2' => \$bowtie2, + 'bowtie1' => \$bowtie1, + 'genomic_composition' => \$genomic_composition, + ); + +if ($help or $man){ + print_helpfile(); + exit; +} + +if ($version){ + print << "VERSION"; + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Genome Preparation Version: $bismark_version + Copyright 2010-16 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + +VERSION + exit; +} + +my $genome_folder = shift @ARGV; # mandatory +my %chromosomes; # checking if chromosome names are unique (required) + +# Ensuring a genome folder has been specified +if ($genome_folder){ + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } + $verbose and print "Path to genome folder specified as: $genome_folder\n"; + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + # making the genome folder path abolsolute so it won't break if the path was specified relative + $genome_folder = getcwd(); + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } +} +else{ + die "Please specify a genome folder to be used for bisulfite conversion\n\n"; +} + + +my $CT_dir; +my $GA_dir; + +if ($bowtie1){ + if ($bowtie2){ + die "You may not select both --bowtie1 and --bowtie2. Make your pick! (default is Bowtie2)\n"; + } + $bowtie2 = 0; + $verbose and print "Aligner to be used: Bowtie (1)\n"; +} +else{ # Bowtie 2 is now the default mode (as of 27 July 2015) + if ($bowtie2){ + $verbose and print "Aligner to be used: Bowtie 2 (user-defined)\n"; + } + else{ + $verbose and print "Aligner to be used: Bowtie 2 (default)\n"; + } + $bowtie2 = 1; +} + +if ($single_fasta){ + warn "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; + $multi_fasta = 0; +} +else{ + warn "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; + $single_fasta = 0; + $multi_fasta = 1; +} + +my @filenames = create_bisulfite_genome_folders(); + +if ($genomic_composition){ + get_genomic_frequencies(); + warn "Finished processing genomic nucleotide frequencies\n\n"; + %chromosomes = (); # resetting +} + +process_sequence_files (); + +launch_bowtie_indexer(); + +sub launch_bowtie_indexer{ + if ($bowtie2){ + warn "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; + } + else{ + warn "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; + } + print "Please be aware that this process can - depending on genome size - take several hours!\n"; + sleep(1); + + ### if the path to bowtie was specfified explicitely + if ($path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie =~ s/$/bowtie2-build/; + } + else{ + $path_to_bowtie =~ s/$/bowtie-build/; + } + } + ### otherwise we assume that bowtie-build is in the path + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2-build'; + } + else{ + $path_to_bowtie = 'bowtie-build'; + } + } + + $verbose and print "\n"; + + ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) + my $pid = fork(); + + # parent process + if ($pid){ + sleep(1); + chdir $CT_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; + + system ("$path_to_bowtie","-f","$file_list","BS_CT")== 0 or die "Parent process: Failed to build index\n"; + waitpid( $pid, 0 ); + $? == 0 or die "Parent process: Child process failed\n"; + } + + # child process + elsif ($pid == 0){ + sleep(2); + chdir $GA_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; + + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } + + # if the platform doesn't support the fork command we will run the indexing processes one after the other + else{ + print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; + # sleep(10); + + ### moving to CT genome folder + $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; + chdir $CT_dir or die "Unable to change directory: $!\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + # sleep(2); + system ("$path_to_bowtie","-f","$file_list","BS_CT"); + @fasta_files=(); + $file_list= ''; + + ### moving to GA genome folder + $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; + chdir $GA_dir or die "Unable to change directory: $!\n"; + @fasta_files = <*.fa>; + $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + # sleep(2); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } +} + + +sub process_sequence_files { + + my ($total_CT_conversions,$total_GA_conversions) = (0,0); + $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; + sleep (3); + + $verbose and print "conversions performed:\n"; + $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; + + + ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file + ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle + ### This is now the default option + + if ($multi_fasta){ + ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files + my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + foreach my $filename(@filenames){ + my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; + # warn "Reading chromosome information from $filename\n\n"; + + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + ### Exiting if a chromosome with the same name was present already + if (exists $chromosomes{$chromosome_name}){ + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else{ + $chromosomes{$chromosome_name}++; + } + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry + + + while (){ + + ### in case the line is a new fastA header + if ($_ =~ /^>/){ + ### printing out the stats for the previous chromosome + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + ### resetting the chromosome transliteration counters + ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + + ### Extracting chromosome name from the additional FastA header + $chromosome_name = extract_chromosome_name($_); + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; + } + + else{ + my $sequence = uc$_; + + ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) + + $sequence =~ s/[^ATCGN\n\r]/N/g; + + ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) + + my $CT_sequence = $sequence; + my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts + $total_CT_conversions += $CT_transliterations_performed; + $chromosome_CT_conversions += $CT_transliterations_performed; + + print CT_CONVERT $CT_sequence; + + ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- + ### complementing the forward strand and then C->T converting it) + + my $GA_sequence = $sequence; + my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand + $total_GA_conversions += $GA_transliterations_performed; + $chromosome_GA_conversions += $GA_transliterations_performed; + + print GA_CONVERT $GA_sequence; + + } + } + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + } + close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; + close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; + + + print "\nTotal number of conversions performed:\n"; + print "C->T:\t$total_CT_conversions\n"; + print "G->A:\t$total_GA_conversions\n"; + + warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; +} + +sub get_genomic_frequencies{ + + warn "Calculating genomic frequencies (this may take several minutes depending on genome size) ...\n"; + warn "="x164,"\n"; + read_genome_into_memory($parent_dir); + foreach my $chr (keys %chromosomes){ + warn "Processing chromosome >> $chr <<\n"; + process_sequence($chromosomes{$chr}); + } + + %genomic_freqs = %freqs; + + ### Attempting to write a genomic nucleotide frequency table out to the genome folder so we can re-use it next time without the need to re-calculate + ### if this fails + if ( open (FREQS,'>',"${genome_folder}genomic_nucleotide_frequencies.txt") ){ + warn "Writing genomic nucleotide frequencies to the file >${genome_folder}genomic_nucleotide_frequencies.txt< for future re-use\n"; + foreach my $f(sort keys %genomic_freqs){ + warn "Writing count of (di-)nucleotide: $f\t$genomic_freqs{$f}\n"; + print FREQS "$f\t$genomic_freqs{$f}\n"; + } + close FREQS or warn "Failed to close filehandle FREQS: $!\n\n"; + } + else{ + warn "Failed to write out file ${genome_folder}genomic_nucleotide_frequencies.txt because of: $!. Skipping writing out genomic frequency table\n\n"; + } +} + + +sub process_sequence{ + + my $seq = shift; + my $mono; + my $di; + + foreach my $index (0..(length$seq)-1){ + my $counted = 0; + if ($index%10000000==0){ + # warn "Current index number is $index\n"; + } + + $mono = substr($seq,$index,1); + unless ( $mono eq 'N'){ + $freqs{$mono}++; + } + + unless ( ($index + 2) > length$seq ){ + $di = substr($seq,$index,2); + if (index($di,'N') < 0) { + $freqs{$di}++; + } + } + } + +} + +sub extract_chromosome_name { + ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + + +sub create_bisulfite_genome_folders{ + + $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; + + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (chdir $path_to_bowtie){ + if ($bowtie2){ + $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; + } + else{ + $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; + } + } + else{ + die "There was an error with the path to bowtie: $!\n"; + } + } + + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + + # Exiting unless there are fastA files in the folder + my @filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@filenames){ + @filenames = <*.fasta>; + } + + unless (@filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; + } + + warn "Bisulfite Genome Indexer version $bismark_version (last modified 25 August 2015)\n\n"; + sleep (3); + + # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists + my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; + unless (-d $bisulfite_dir){ + mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; + } + else{ + print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; + sleep(5); + } + + chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; + $CT_dir = "${bisulfite_dir}CT_conversion/"; + $GA_dir = "${bisulfite_dir}GA_conversion/"; + + # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) + # converted version of the genome + unless (-d $CT_dir){ + mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; + } + unless (-d $GA_dir){ + mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; + } + + # moving back to the original genome folder + chdir $genome_folder or die "Could't move to directory $genome_folder $!"; + # $verbose and print "Moved back to genome folder folder $genome_folder\n"; + warn "\nStep I - Prepare genome folders - completed\n\n\n"; + return @filenames; +} + +sub read_genome_into_memory{ + + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + # skipping the tophat entire mouse genome fasta file + next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + $first_line =~ s/\r//; # removing /r carriage returns + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (){ + chomp; + $_ =~ s/\r//; # removing /r carriage returns + + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + } + warn "\n"; + chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; +} + + + + +sub print_helpfile{ + print << 'HOW_TO'; + + +DESCRIPTION + +This script is supposed to convert a specified reference genome into two different bisulfite +converted versions and index them for alignments with Bowtie 2 (default), or Bowtie 1. The first +bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs +converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference +genome folder. Once the bisulfite conversion has been completed the program will fork and launch +two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware +that the indexing process can take up to several hours; this will mainly depend on genome size +and system resources. + + + +The following is a brief description of command line options and arguments to control the +Bismark Genome Preparation: + + +USAGE: bismark_genome_preparation [options] + + +OPTIONS: + +--help/--man Displays this help filea and exits. + +--version Displays version information and exits. + +--verbose Print verbose output for more details or debugging. + +--path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system + (depending on which aligner/indexer you intend to use). Unless this path + is specified it is assumed that Bowtie is in the PATH. + +--bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: ON). + +--bowtie1 This will create bisulfite indexes for Bowtie 1. (Default: OFF). + +--single_fasta Instruct the Bismark Indexer to write the converted genomes into + single-entry FastA files instead of making one multi-FastA file (MFA) + per chromosome. This might be useful if individual bisulfite converted + chromosomes are needed (e.g. for debugging), however it can cause a + problem with indexing if the number of chromosomes is vast (this is likely + to be in the range of several thousand files; the operating system can + only handle lists up to a certain length, and some newly assembled + genomes may contain 20000-500000 contigs of scaffold files which do exceed + this list length limit). + +--genomic_composition Calculate and extract the genomic sequence composition for mono and di-nucleotides + and write the genomic composition table 'genomic_nucleotide_frequencies.txt' to the + genome folder. This may be useful later on when using bam2nuc or the Bismark option + --nucleotide_coverage. + + +ARGUMENTS: + + The path to the folder containing the genome to be bisulfite converted. + The Bismark Genome Preparation expects one or more fastA files in the folder + (with the file extension: .fa or .fasta). Specifying this path is mandatory. + + +This script was last modified on 07 July 2016. +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_mapping/bismark_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_mapping/bismark_wrapper.py Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,379 @@ +#!/usr/bin/env python + +import argparse +import os +import shutil +import subprocess +import sys +import shlex +import tempfile +import fileinput +import fileinput +from glob import glob + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def __main__(): + + #Parse Command Line + parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') + parser.add_argument( '-p', '--num-threads', dest='num_threads', + type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) + + parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) + + parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' ) + + # input options + parser.add_argument( '--own-file', dest='own_file', help='' ) + parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) + parser.add_argument( '-O', '--output', dest='output' ) + + + parser.add_argument( '--output-report-file', dest='output_report_file' ) + parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) + + parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) + + + parser.add_argument( '-1', '--mate1', dest='mate1', + help='The forward reads file in Sanger FASTQ or FASTA format.' ) + parser.add_argument( '-2', '--mate2', dest='mate2', + help='The reverse reads file in Sanger FASTQ or FASTA format.' ) + parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" ) + + parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', + help='Additional output file with unmapped reads (single-end).' ) + parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', + help='File name for unmapped reads (left, paired-end).' ) + parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', + help='File name for unmapped reads (right, paired-end).' ) + + + parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', + help='Additional output file with suppressed reads (single-end).' ) + parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', + help='File name for suppressed reads (left, paired-end).' ) + parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', + help='File name for suppressed reads (right, paired-end).' ) + parser.add_argument( '--stdout', dest='output_stdout', + help='File name for the standard output of bismark.' ) + + + parser.add_argument( '--single-paired', dest='single_paired', + help='The single-end reads file in Sanger FASTQ or FASTA format.' ) + + parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') + parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') + parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) + + + parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) + parser.add_argument( '--qupto', type=int) + + + # paired end options + parser.add_argument( '-I', '--minins', dest='min_insert' ) + parser.add_argument( '-X', '--maxins', dest='max_insert' ) + parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) + parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) + + #parse general options + # default 20 + parser.add_argument( '--seed-len', dest='seed_len', type=int) + # default 15 + parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) + # default 0 + parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) + # default 2 + parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) + """ + # default 70 + parser.add_argument( '--maqerr', dest='maqerr', type=int ) + """ + + """ + The number of megabytes of memory a given thread is given to store path + descriptors in --best mode. Best-first search must keep track of many paths + at once to ensure it is always extending the path with the lowest cumulative + cost. Bowtie tries to minimize the memory impact of the descriptors, but + they can still grow very large in some cases. If you receive an error message + saying that chunk memory has been exhausted in --best mode, try adjusting + this parameter up to dedicate more memory to the descriptors. Default: 512. + """ + parser.add_argument( '--chunkmbs', type=int, default=512 ) + + args = parser.parse_args() + + # Create bismark index if necessary. + index_dir = "" + if args.own_file: + """ + Create a temporary index with the offered files from the user. + Utilizing the script: bismark_genome_preparation + bismark_genome_preparation --bowtie2 hg19/ + """ + tmp_index_dir = tempfile.mkdtemp() + index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) + try: + """ + Create a hard link pointing to args.own_file named 'index_path'.fa. + """ + os.symlink( args.own_file, index_path + '.fa' ) + except Exception, e: + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + stop_err( 'Error in linking the reference database.\n' + str( e ) ) + # bismark_genome_preparation needs the complete path to the folder in which the database is stored + if args.bowtie2: + cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) + else: + cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) + if args.bismark_path: + if os.path.exists(args.bismark_path): + # add the path to the bismark perl scripts, that is needed for galaxy + cmd_index = os.path.join(args.bismark_path, cmd_index) + else: + # assume the same directory as that script + cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) + try: + tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name + tmp_stderr = open( tmp, 'wb' ) + proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + except Exception, e: + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + stop_err( 'Error indexing reference sequence\n' + str( e ) ) + index_dir = tmp_index_dir + else: + # bowtie path is the path to the index directory and the first path of the index file name + index_dir = os.path.dirname( args.index_path ) + + # Build bismark command + + """ + Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the + TMP to some larger space. It's not recommended but it works. + """ + #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) + tmp_bismark_dir = tempfile.mkdtemp() + output_dir = os.path.join( tmp_bismark_dir, 'results') + cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' + + if args.fasta: + # the query input files (specified as mate1,mate2 or singles) are FastA + cmd = '%s %s' % (cmd, '--fasta') + elif args.fastq: + cmd = '%s %s' % (cmd, '--fastq') + + if args.bismark_path: + # add the path to the bismark perl scripts, that is needed for galaxy + if os.path.exists(args.bismark_path): + cmd = os.path.join(args.bismark_path, cmd) + else: + # assume the same directory as that script + cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) + + arguments = { + 'genome_folder': index_dir, + 'args': '', + 'tmp_bismark_dir': tmp_bismark_dir, + 'output_dir': output_dir, + } + + additional_opts = '' + # Set up the reads + if args.mate_paired: + # paired-end reads library + reads = '-1 %s ' % ( args.mate1 ) + reads += ' -2 %s ' % ( args.mate2 ) + additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) + else: + # single paired reads library + reads = ' %s ' % ( args.single_paired ) + + + if not args.bowtie2: + # use bowtie specific options + #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default + if args.seed_mismatches: + # --seedmms + additional_opts += ' -n %s ' % args.seed_mismatches + if args.seed_len: + # --seedlen + additional_opts += ' -l %s ' % args.seed_len + + # alignment options + if args.bowtie2: + additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each + if args.seed_mismatches: + additional_opts += ' -N %s ' % args.seed_mismatches + if args.seed_len: + additional_opts += ' -L %s ' % args.seed_len + if args.seed_extention_attempts: + additional_opts += ' -D %s ' % args.seed_extention_attempts + if args.max_reseed: + additional_opts += ' -R %s ' % args.max_reseed + if args.no_discordant: + additional_opts += ' --no-discordant ' + if args.no_mixed: + additional_opts += ' --no-mixed ' + """ + if args.maqerr: + additional_opts += ' --maqerr %s ' % args.maqerr + """ + if args.skip_reads: + additional_opts += ' --skip %s ' % args.skip_reads + if args.qupto: + additional_opts += ' --qupto %s ' % args.qupto + if args.phred64: + additional_opts += ' --phred64-quals ' + if args.suppress_header: + additional_opts += ' --sam-no-hd ' + if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): + additional_opts += ' --un ' + if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): + additional_opts += ' --ambiguous ' + + arguments.update( {'args': additional_opts, 'reads': reads} ) + + # Final bismark command: + cmd = cmd % arguments + # Run + try: + tmp_out = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp_out, 'wb' ) + tmp_err = tempfile.NamedTemporaryFile().name + tmp_stderr = open( tmp_err, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) + returncode = proc.wait() + + if returncode != 0: + tmp_stdout.close() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_err, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + + raise Exception, stderr + tmp_stdout.close() + tmp_stderr.close() + + # TODO: look for errors in program output. + except Exception, e: + stop_err( 'Error in bismark:\n' + str( e ) ) + + # collect and copy output files + if args.output_report_file: + output_report_file = open(args.output_report_file, 'w+') + for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): + output_report_file.write(line) + output_report_file.close() + + + if args.output_suppressed_reads: + if glob(os.path.join( output_dir, '*ambiguous_reads.txt')): + shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) + if args.output_suppressed_reads_l: + if glob(os.path.join(output_dir, '*ambiguous_reads_1.txt')): + shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) + if args.output_suppressed_reads_r: + if glob(os.path.join(output_dir, '*ambiguous_reads_2.txt')): + shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) + + if args.output_unmapped_reads: + if glob(os.path.join(output_dir, '*unmapped_reads.txt')): + shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) + if args.output_unmapped_reads_l: + if glob(os.path.join(output_dir, '*unmapped_reads_1.txt')): + shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) + if args.output_unmapped_reads_r: + if glob(os.path.join(output_dir, '*unmapped_reads_2.txt')): + shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) + + try: + """ + merge all bam files + """ + #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name + tmp_stdout = open( tmp_out, 'wab' ) + #tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name + tmp_stderr = open( tmp_err, 'wab' ) + + tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name + + bam_files = glob( os.path.join( output_dir, '*.bam') ) + if len( bam_files ) > 1: + cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) ) + + proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) + + returncode = proc.wait() + tmp_stdout.close() + tmp_stderr.close() + if returncode != 0: + raise Exception, open( tmp_stderr.name ).read() + else: + tmp_res = bam_files[0] + + bam_path = "%s" % tmp_res + + if os.path.exists( bam_path ): + if args.sort_bam: + cmd = 'samtools sort -@ %s %s sorted_bam' % (args.num_threads, bam_path) + proc = subprocess.Popen( args=shlex.split( cmd ) ) + returncode = proc.wait() + if returncode != 0: + raise Exception("Error during '%s'" % cmd) + shutil.move( 'sorted_bam.bam', args.output ) + else: + shutil.move( bam_path, args.output ) + else: + stop_err( 'BAM file no found:\n' + str( bam_path ) ) + + + # TODO: look for errors in program output. + except Exception, e: + stop_err( 'Error in merging bam files:\n' + str( e ) ) + + + if args.output_stdout: + # copy the temporary saved stdout from bismark + shutil.move( tmp_out, args.output_stdout ) + + # Clean up temp dirs + if args.own_file: + if os.path.exists( tmp_index_dir ): + shutil.rmtree( tmp_index_dir ) + if os.path.exists( tmp_bismark_dir ): + shutil.rmtree( tmp_bismark_dir ) + if os.path.exists( output_dir ): + shutil.rmtree( output_dir ) + +if __name__=="__main__": __main__() diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methyl_extractor/bismark2bedGraph --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/bismark2bedGraph Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,830 @@ +#!/usr/bin/env perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my $bismark2bedGraph_version = 'v0.16.3'; + +my @bedfiles; +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @sorting_files; + +my ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir) = process_commandline(); + +warn "Using these input files: @sorting_files\n"; +warn "\nSummary of parameters for bismark2bedGraph conversion:\n"; +warn '='x54,"\n"; +warn "bedGraph output:\t\t$bedGraph_output\n"; +warn "output directory:\t\t>$output_dir<\n"; +if ($remove){ + warn "remove whitespaces:\t\tyes\n"; +} +else{ + warn "remove whitespaces:\t\tno\n"; +} +if ($CX_context){ + warn "CX context:\t\t\tyes\n"; +} +else{ + warn "CX context:\t\t\tno (CpG context only, default)\n"; +} +if ($no_header){ + warn "No-header selected:\t\tyes\n"; +} +else{ + warn "No-header selected:\t\tno\n"; +} + +if ($ample_mem){ + warn "Sorting method:\t\t\tArray-based (faster, but larger memory footprint)\n"; +} +else{ + warn "Sorting method:\t\t\tUnix sort-based (smaller memory footprint, but slower)\n"; +} +unless($ample_mem){ + warn "Sort buffer size:\t\t$sort_size\n"; +} +warn "Coverage threshold:\t\t$coverage_threshold\n"; + + +warn "="x77,"\n"; +warn "Methylation information will now be written into a bedGraph and coverage file\n"; +warn "="x77,"\n\n"; +sleep (2); + +### deciding which files to use for bedGraph conversion +foreach my $filename (@sorting_files){ + + ### DETERMINING THE FULL PATH OF INPUT FILES + if ($filename =~ /^(.*\/)(.*)$/){ # if files are in a different output folder we extract the filename again + # warn "folder name: $1\nfilename: $2\n\n"; + chdir $1 or die "Failed to change directory to $1\n"; # $1 might be a relative path + $input_dir = getcwd(); # this will always be the full path + $filename = $2; + # warn "Full Input folder: $input_dir\nFilename: $filename\n\n"; sleep (1); + chdir $parent_dir or die "Failed to move back to the parent directory\n\n"; # moving back + } + else{ + $input_dir = $parent_dir; + } + $input_dir .= '/'; + + if ($CX_context){ + # push @bedfiles,$output_dir.$filename; + push @bedfiles,$input_dir.$filename; + } + else{ ## CpG context only (default) + if ($filename =~ /^CpG/){ # only testing the actual filename without the path information + push @bedfiles,$input_dir.$filename; # we are adding the full path to the filename + } + else{ + # skipping CHH or CHG files + } + } +} + +if (@bedfiles){ + warn "Using the following files as Input:\n"; + print join ("\t",@bedfiles),"\n\n"; + sleep (2); +} +else{ + die "It seems that you are trying to generate bedGraph files for files not starting with CpG.... Please specify the option '--CX' and try again\n\n"; +} + +open (OUT,"| gzip -c - > ${output_dir}${bedGraph_output}") or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!\n"; +warn "Writing bedGraph to file: $bedGraph_output\n"; +print OUT "track type=bedGraph\n"; + +my $coverage_output = $bedGraph_output; +unless ($coverage_output =~ s/bedGraph\.gz$/bismark.cov.gz/){ + $coverage_output =~ s/$/.bismark.cov.gz/; +} + +open (COVERAGE,"| gzip -c - > ${output_dir}${coverage_output}") or die "Problems writing to the coverage output detected. File path: '$output_dir'\tfile name: '$coverage_output' $!\n\n"; +warn "Also writing out a coverage file including counts methylated and unmethylated residues to file: $coverage_output\n"; + +if ($zero){ + my $zero_coverage_output = $bedGraph_output; + unless ($zero_coverage_output =~ s/bedGraph$/bismark.zero.cov/){ + $zero_coverage_output =~ s/$/.bismark.zero.cov/; + } + + open (ZEROCOVERAGE,'>',$output_dir.$zero_coverage_output) or die "Problems writing to the zero-based coverage output detected. File path: '$output_dir'\tfile name: '$\zero_coverage_output' $!\n\n"; + warn "Also writing out a 0-based, half-open coverage file including counts methylated and unmethylated residues to file: $zero_coverage_output\n"; +} +warn "\n"; + +my %temp_fhs; +my @temp_files; # writing all context files (default CpG only) to these files prior to sorting +my %chr_lengths; # storing chromosome lenghts in '--ample_memory' mode + +### changing to the output directory +unless ($output_dir eq ''){ # default + chdir $output_dir or die "Failed to change directory to $output_dir\n"; + warn "Changed directory to $output_dir\n"; +} + +if ($gazillion){ + if (scalar @bedfiles == 1){ + warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Sorting everything in memory instead of writing out individual chromosome files ...\n"; + } + else{ + warn "The genome of interest was specified to contain gazillions of chromosomes or scaffolds. Merging all input files and sorting everything in memory instead of writing out individual chromosome files...\n"; + my $merge = "$bedGraph_output.methylation_calls.merged"; + open (MERGE,'>',$merge) or die "Failed to write to temporary merged file $merge: $!\n"; + warn "Writing all merged methylation calls to temp file $merge\n\n"; sleep(2); + push @temp_files, $merge; + } +} + +foreach my $infile (@bedfiles) { + + # warn "Processing $infile\n"; + + if ($remove) { + warn "Now replacing whitespaces in the sequence ID field of the Bismark methylation extractor output $infile prior to bedGraph conversion\n\n"; + + if ($infile =~ /gz$/){ + open (READ,"gunzip -c $infile |") or die $!; + } + else{ + open (READ,$infile) or die $!; + } + + my $removed_spaces_outfile = $infile; + $removed_spaces_outfile =~ s/$/.spaces_removed.txt/; + $removed_spaces_outfile =~ s/.*\///;# replacing everything up to the last slash in the filename + + warn "Attempting to write to file ${output_dir}${removed_spaces_outfile}\n\n"; + open (REM,'>',$output_dir.$removed_spaces_outfile) or die "Couldn't write to file $removed_spaces_outfile: $!\n"; + + unless ($no_header){ + $_ = ; ### Bismark version header + print REM $_; ### Bismark version header + } + + while () { + chomp; + my ($id,$strand,$chr,$pos,$context) = (split (/\t/)); + $id =~ s/\s+/_/g; + print REM join ("\t",$id,$strand,$chr,$pos,$context),"\n"; + } + + close READ or die $!; + close REM or die $!; + + ### changing the infile name to the new file without spaces + $infile = $removed_spaces_outfile; # at this stage we are already in the output directory so it should pick it up correctly + } + + # opening infile + if ($infile =~ /gz$/){ + open (IN,"gunzip -c $infile |") or die "Couldn't find file '$infile': $!\n"; + } + else{ + open (IN,$infile) or die "Couldn't find file '$infile': $!\n"; + } + + if ($infile =~ /\//){ # if input files are in a different folders we extract the filename again + $infile =~ s/.*\///;# replacing everything up to the last slash in the filename + # warn "Renamed Infile: $infile\n"; + } + + ### People these days seem to be aligning their data to newly assembled genomes more and more, which sometimes conist of up to half a million scaffolds instead of ~23 chromosomes. This + ### does normally clash with the operating system's limit of files that can be open for writing at the same time, and it is difficult and probably not advisable to increase this + ### limit (some even say there is a reason for the OS doing so...). + ### To still allow then generation of bedGraph files we will in these cases sort everything using the Linux sort command instead, which will sort by chromosome and position (the + ### chromosome sorting is not carried out for chromosome sorted files which makes the sort MUCH faster). + + if ($gazillion){ + # using all infiles instead of sorting + if (scalar @bedfiles == 1){ + push @temp_files, $infile; + } + else{ + ## always ignoring the version header + unless ($no_header){ + $_ = ; ### Bismark version header + } + + while () { + if ($_ =~ /^Bismark /){ + warn "Found Bismark version information. Skipping this line (should still work fine) but consider using '--no_header' next time...\n"; + next; + } + print MERGE; + } + warn "Finished writing methylation calls from $infile to merged temp file\n"; + } + } + else{ + warn "Now writing methylation information for file >>$infile<< to individual files for each chromosome\n"; + + ## always ignoring the version header + unless ($no_header){ + $_ = ; ### Bismark version header + } + + while () { + if ($_ =~ /^Bismark /){ + warn "Found Bismark version information. Skipping this line (should still work fine) but consider losing '--no_header' next time...\n"; + next; + } + + chomp; + + my ($chr,$pos) = (split (/\t/))[2,3]; + + ### If --ample_mem was specified we are keeping track of the highest position for each chromosome as this will determine the size of the array we need to create in the next step + if ($ample_mem){ + ### setting the first position for this chromosome + unless (defined $chr_lengths{$chr} ){ + $chr_lengths{$chr} = $pos; + } + # for all subsequent postions for this chromosome + if ($pos > $chr_lengths{$chr} ){ + $chr_lengths{$chr} = $pos; # set the current position as the new highest position + } + } + + # warn "This is the chromosome name before replacing '|' characters:\t$chr\n\n"; + $chr =~ s/\|/_/g; # replacing pipe ('|') characters in the file names + # warn "This is the chromosome name AFTER replacing '|' characters:\t$chr\n\n"; + unless (exists $temp_fhs{$chr}) { # Including the infile name to the temporary chromosome files to enable parallel processing of multiple files at the same time + + my $temp_file_name = $infile.'.chr'.$chr.'.methXtractor.temp'; + # warn "Using temp file name: $temp_file_name\n"; sleep(1); + + open ($temp_fhs{$chr},'>',$infile.'.chr'.$chr.'.methXtractor.temp') or die "Failed to open filehandle: $!"; + push @temp_files, $temp_file_name; # storing temp files as we open them instead + } + + print {$temp_fhs{$chr}} "$_\n"; + } + + warn "Finished writing out individual chromosome files for $infile\n"; + } +} + +# closing temporary filehandles to force writing out buffered content +foreach my $temp_fh(keys %temp_fhs){ + close $temp_fhs{$temp_fh} or warn "Failed to close temporary filehandle $temp_fhs{$temp_fh}: $!\n"; +} + +### printing out the determined maximum position for each chromosome +if ($ample_mem){ + foreach my $chr (sort keys %chr_lengths){ + warn "Highest determined position for chromosome $chr:\t\t$chr_lengths{$chr} bp\n"; + } + warn "\n"; +} + +unless ($gazillion){ + warn "\n"; + warn "Collecting temporary chromosome file information... Processing the following input file(s):\n"; + warn join ("\n",@temp_files),"\n\n"; + sleep (1); +} + +if ($gazillion){ + if (scalar @bedfiles > 1){ + close (MERGE) or die "Failed to close filehandle MERGE: $!\n"; + } +} + +foreach my $in (@temp_files) { + + if ($sort_size){ + warn "Sorting input file $in by positions (using -S of $sort_size)\n" unless ($ample_mem); + } + + my $ifh; + + my $name; + my $meth_state; + my $chr = ""; + my $pos = 0; + my $meth_state2; + + my $last_pos; + my $last_chr; + + ### If the user specified to have a lot of RAM available (probably in the range of > 16GB for 2 arrays of human genome Chromosome 1) we will sort the methylation calls in two big arrays instead of using the Unix sort command + if ($ample_mem){ + # warn "Generating enormous array instead of sorting the file. This may temporily use quite a bit of memory (RAM)!\n\n"; + + my @meth_count; + my @unmeth_count; + + open ($ifh,$in) or die "Couldn't read from temporary file '$in': $!\n"; + + while (my $line = <$ifh>){ + next if ($line =~ /^Bismark/); + chomp $line; + + ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; + + unless ($last_pos and $last_chr){ + $last_chr = $chr; + $last_pos = $pos; + } + unless (@meth_count and @unmeth_count){ + warn "Setting maximum position of arrays \@meth_count and \@unmeth_count for chromosome $chr to $chr_lengths{$chr}\n"; + @meth_count = (0) x $chr_lengths{$chr}; + @unmeth_count = (0) x $chr_lengths{$chr}; + # warn "length of array meth count: ",scalar @meth_count,"\n"; + warn "Finished generating arrays\n"; + # sleep(1); + } + # warn "Chromosome\tStart Position\tEnd Position\tMethylation Percentage\n"; sleep(1); + # print join ("\t",$name, $meth_state, $chr, $pos, $meth_state2),"\n"; + # sleep(1); + + # if ($last_chr ne $chr) { + # die "Reached new chromosome '$chr' which mustn't happen from pre-sorted files (previous chromosome was: '$last_chr')\n"; + # } + + my $validated = validate_methylation_call($meth_state, $meth_state2); # as a comment, methylation calls in Unknown context (U, u) would fail this check, but they should be ignored by the methylation extractor anyway + unless($validated){ + warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; + next; + } + + if ($meth_state eq '+'){ + # warn "increasing meth $pos by 1\n"; sleep(1); + $meth_count[$pos-1]++; + } + else{ + $unmeth_count[$pos-1]++; + # warn "increasing unmeth $pos by 1\n"; sleep(1); + } + } + + close $ifh or die $!; + + warn "Now printing methylation information for this chromosome\n"; + # warn "length of array meth count: ",scalar @meth_count,"\n"; + # warn "chr\tposition\tcount methylated\tcount unmethylated\tcount total\n"; + foreach my $index (0..$#meth_count){ + my $totalcount = $meth_count[$index] + $unmeth_count[$index]; + if ($totalcount > 0){ + # warn "$index\t$meth_count[$index]\t$unmeth_count[$index]\t$totalcount\n"; + # sleep(1); + + my $bed_pos = $index; ### bedGraph coordinates are 0 based + my $one_based_pos = $bed_pos + 1; + + my $meth_percentage; + ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($meth_count[$index]/$totalcount) * 100) : ($meth_percentage = undef); + + if (defined $meth_percentage){ + + # as of version 0.9.1 we will by default write out both a bedGraph and a more detailed coverage file + + # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (half-open. Clever, huh?) + print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n"; + + # this is the coverage file. Coordinates are 1-based + print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n"; + + # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end + if ($zero){ + print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$meth_count[$index]\t$unmeth_count[$index]\n"; + } + + } + } + } + + @meth_count = (); + @unmeth_count = (); + + } + ### default: we assume that the user wants to use the Linux Sort command. This is quite a bit slower, but features a much smaller memory footprint + else{ + my $sort_dir = './'; # there has been a cd into the output_directory already + # my $sort_dir = $output_dir; + # if ($sort_dir eq ''){ + # $sort_dir = './'; + # } + + if ($gazillion){ + if ($in =~ /gz$/){ + open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k3,3V -k4,4n |" or die "Input file could not be sorted. $!\n"; + } + else{ + open $ifh, "sort -S $sort_size -T $sort_dir -k3,3V -k4,4n $in |" or die "Input file could not be sorted. $!\n"; + } + ### Comment by Volker Brendel, Indiana University + ### "The -k3,3V sort option is critical when the sequence names are numbered scaffolds (without left-buffering of zeros). Omit the V, and things go very wrong in the tallying of reads." + } + else{ + ### this sort command was used previously and sorts according to chromosome in addition to position. Since the files are being sorted according to chromosomes anyway, + ### we may drop the -k3,3V option. It has been reported that this will result in a dramatic speed increase + if ($in =~ /gz$/){ + open $ifh, "gunzip -c $in | sort -S $sort_size -T $sort_dir -k4,4n |" or die "Input file could not be sorted. $!\n"; + } + else{ + open $ifh, "sort -S $sort_size -T $sort_dir -k4,4n $in |" or die "Input file could not be sorted. $!\n"; + } + } + + while (my $line = <$ifh>) { + next if ($line =~ /^Bismark/); + chomp $line; + #warn "full line:\n$line\n"; + $last_chr = $chr; + $last_pos = $pos; + ($name, $meth_state, $chr, $pos, $meth_state2) = split "\t", $line; + #warn "$name, $meth_state, $chr, $pos, $meth_state2\n"; sleep(1); + if (($last_pos ne $pos) || ($last_chr ne $chr)) { + generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; + @methylcalls = qw (0 0 0); + } + + my $validated = validate_methylation_call($meth_state, $meth_state2); + unless($validated){ + warn "Methylation state of sequence ($name) in file ($in) on line $. is inconsistent (meth_state is $meth_state, meth_state2 = $meth_state2)\n"; + next; + } + if ($meth_state eq "+") { + $methylcalls[0]++; + $methylcalls[2]++; + } else { + $methylcalls[1]++; + $methylcalls[2]++; + } + } + + + $last_chr = $chr; + $last_pos = $pos; + if ($methylcalls[2] > 0) { + generate_output($last_chr,$last_pos) if $methylcalls[2] > 0; + } + + close $ifh or die $!; + + @methylcalls = qw (0 0 0); # resetting @methylcalls + + } + + ### deleting temporary files (only needed if --gazillion hasn't been specified + if ($gazillion and scalar @bedfiles == 1){ + # if there was only 1 file to sort this will be the input file, which obviously shouldn't be removed + } + else{ + my $delete = unlink $in; + if ($delete) { + warn "Successfully deleted the temporary input file $in\n\n"; + } + else { + warn "The temporary inputfile $in could not be deleted $!\n\n"; + } + } +} + +close OUT or die $!; +close COVERAGE or die $!; +if ($zero){ + close ZEROCOVERAGE or die $!; +} + +exit 0; + + + +sub validate_methylation_call{ + my $meth_state = shift; + croak "Missing (+/-) methylation call" unless defined $meth_state; + my $meth_state2 = shift; + croak "Missing alphabetical methylation call" unless defined $meth_state2; + my $is_consistent; + ($meth_state2 =~ /^z/i) ? ($is_consistent = check_CpG_methylation_call($meth_state, $meth_state2)) + : ($is_consistent = check_nonCpG_methylation_call($meth_state,$meth_state2)); + return 1 if $is_consistent; + return 0; +} + +sub check_CpG_methylation_call{ + my $meth1 = shift; + my $meth2 = shift; + return 1 if($meth1 eq "+" && $meth2 eq "Z"); + return 1 if($meth1 eq "-" && $meth2 eq "z"); + return 0; +} + +sub check_nonCpG_methylation_call{ + my $meth1 = shift; + my $meth2 = shift; + return 1 if($meth1 eq "+" && $meth2 eq "C"); + return 1 if($meth1 eq "+" && $meth2 eq "X"); + return 1 if($meth1 eq "+" && $meth2 eq "H"); + return 1 if($meth1 eq "-" && $meth2 eq "c"); + return 1 if($meth1 eq "-" && $meth2 eq "x"); + return 1 if($meth1 eq "-" && $meth2 eq "h"); + return 0; +} + +sub generate_output{ + my $methcount = $methylcalls[0]; + my $nonmethcount = $methylcalls[1]; + my $totalcount = $methylcalls[2]; + my $last_chr = shift; + my $last_pos = shift; + croak "Should not be generating output if there's no reads to this region" unless ($totalcount > 0); + croak "Total counts ($totalcount) is not the sum of the methylated ($methcount) and unmethylated ($nonmethcount) counts" if ($totalcount != ($methcount + $nonmethcount) ); + + my $bed_pos = $last_pos - 1; ### Bismark coordinates are 1 based whereas bedGraph coordinates are 0 based. + my $meth_percentage; + ($totalcount >= $coverage_threshold) ? ($meth_percentage = ($methcount/$totalcount) * 100) : ($meth_percentage = undef); + # $meth_percentage =~ s/(\.\d\d).+$/$1/ unless $meth_percentage =~ /^Below/; + if (defined $meth_percentage){ + + # this is the bedGraph file, the starting position is 0-based, the end position is 1-based! (clever, huh?) + my $one_based_pos = $bed_pos + 1; + print OUT "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\n"; + + # this is the coverage file. Coordinates are 1-based + print COVERAGE "$last_chr\t$one_based_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; + + # this is an optional 0-based, half-open coverage file. Coordinates are 0-based start and 1-based end + if ($zero){ + print ZEROCOVERAGE "$last_chr\t$bed_pos\t$one_based_pos\t$meth_percentage\t$methcount\t$nonmethcount\n"; + } + } + +} + +sub process_commandline{ + my $help; + my $output_dir; + my $bedGraph_output; + my $no_header; + my $coverage_threshold; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $CX_context; + my $sort_size; + my $version; + my $gazillion; + my $ample_mem; + my $zero; + my $input_dir; + + my $command_line = GetOptions ('help|man' => \$help, + 'dir=s' => \$output_dir, + 'o|output=s' => \$bedGraph_output, + 'no_header' => \$no_header, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "CX|CX_context" => \$CX_context, + "buffer_size=s" => \$sort_size, + 'version' => \$version, + 'gazillion|scaffolds' => \$gazillion, + 'ample_memory' => \$ample_mem, + "zero_based" => \$zero, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($version){ + print << "VERSION"; + + + Bismark Methylation Extractor Module - + bismark2bedGraph + + Bismark Extractor Version: $bismark2bedGraph_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + @sorting_files = @ARGV; + + ### no files provided + unless (@sorting_files){ + warn "You need to provide one or more Bismark methylation caller files to create an individual C methylation bedGraph output. Please respecify!\n\n"; + sleep(2); + + print_helpfile(); + exit; + } + + ### PARENT DIRECTORY + my $parent_dir = getcwd(); + # warn "parent directory is: $parent_dir\n"; + + ### OUTPUT DIR PATH + if (defined $output_dir){ + + unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + unless (-d $output_dir){ + mkdir $output_dir or die "Failed to create output directory $output_dir: $!\n\n"; + warn "Created output directory $output_dir\n"; + } + + ### want to get an absolute path for the output directory instead of a relative one + chdir $output_dir or die "Failed to move into output directory '$output_dir': $!\n\n"; + $output_dir = getcwd(); + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + # warn "output directory is: $output_dir\n"; + + # changing back to the parent directory + chdir $parent_dir or die "Failed to move back into parent directory '$parent_dir': $!\n\n"; + + } + + } + else{ + $output_dir = ''; + } + + unless (defined $bedGraph_output){ + die "Please provide the name of the output file using the option -o/--output filename\n"; + } + + if ($bedGraph_output =~ /\//){ # this is supposed a filename and not a path name + die "Please specify a file name without any path information (or use --dir if necessary)\n\n"; + } + + unless ($bedGraph_output =~ /\.gz$/){ + $bedGraph_output = "${bedGraph_output}.gz"; ### 22 07 2015: Output will be gzip compressed + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + unless ($CX_context){ + $CX_context = 0; + } + + unless ($counts){ + $counts = 1; + } + + if ($gazillion){ + if ($ample_mem){ + die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; + } + } + + return ($bedGraph_output,$parent_dir,$output_dir,$remove,$CX_context,$no_header,$sort_size,$coverage_threshold,$counts,$gazillion,$ample_mem,$zero,$input_dir); +} + + +sub print_helpfile{ + print < [methylation extractor input files] + +Methylation extractor input files: These files are required to start with CpG... in order for the +script to correctly work out the sequence context when using CpG context only (default). If all cytosine +contexts are selected ('--CX_context'), all input files will be used regardless of their file file name(s). + + +-o/--output Name of the output file, mandatory. + +--dir Output directory. Output is written to the current directory if not specified explicitly. + +--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide + before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + +--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of + scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to + individual chromosome files. These errors were caused by the operating system's limit + of the number of filehandle that can be written to at any one time (typically 1024; to + find out this limit on Linux, type: ulimit -a). + To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort + methylation calls into individual chromosome files. Instead, all input files are + temporarily merged into a single file (unless there is only a single file), and this + file will then be sorted by both chromosome AND position using the Unix sort command. + Please be aware that this option might take a looooong time to complete, depending on + the size of the input files, and the memory you allocate to this process (see --buffer_size). + Nevertheless, it seems to be working. + +--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will + instead use two arrays to sort methylated and unmethylated calls, respectively. This may result + in a faster sorting process for very large files, but this comes at the cost of a larger memory + footprint (as an estimate, two arrays of the length of (the largest) human chromosome 1 (nearly + 250 million bp) temporarily consume around 16GB of RAM). Note however that due to the overheads + of creating and looping through arrays this option might in fact be *slower* for small-ish + files (up to a few million alignments). Note also that this option is not currently compatible + with options '--scaffolds/--gazillion'. + +--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start + and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, + instead of using 1-based coordinates throughout. Default: OFF. + + + +The bedGraph output looks like this (tab-delimited; 0-based start coords, 1-based end coords): +============================================================================================== + +track type=bedGraph (header line) + + + + + +The coverage output looks like this (tab-delimited, 1-based genomic coords; optional zero-based, half-open coords with '--zero_based'): +======================================================================================================================================= + + + + + Script last modified: 09 December 2015 + +EOF + ; + exit 1; +} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methyl_extractor/bismark_methylation_extractor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/bismark_methylation_extractor Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,5907 @@ +#!/usr/bin/env perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my @filenames; # input files +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.16.3'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline(); + + +### only needed for bedGraph output +my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @bedfiles; + +### only needed for genome-wide cytosine methylation report +my %chromosomes; + +my %mbias_1; +my %mbias_2; + + +############################################################################################## +### Summarising Run Parameters +############################################################################################## + +### METHYLATION EXTRACTOR + +warn "Summarising Bismark methylation extractor parameters:\n"; +warn '='x63,"\n"; + +if ($single){ + if ($vanilla){ + warn "Bismark single-end vanilla format specified\n"; + } + else{ + warn "Bismark single-end SAM format specified (default)\n"; # default + } +} +elsif ($paired){ + if ($vanilla){ + warn "Bismark paired-end vanilla format specified\n"; + } + else{ + warn "Bismark paired-end SAM format specified (default)\n"; # default + } +} + +warn "Number of cores to be used: $multicore\n"; + +if ($single){ + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string\n"; + } + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n"; + } + +} +else{ ## paired-end + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_r2){ + warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_3prime_r2){ + warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + +} + + +if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; +} +if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; +} +### output directory +if ($output_dir eq ''){ + warn "Output will be written to the current directory ('$parent_dir')\n"; +} +else{ + warn "Output path specified as: $output_dir\n"; +} + + +sleep (1); + +### BEDGRAPH + +if ($bedGraph){ + warn "\n\nSummarising bedGraph parameters:\n"; + warn '='x63,"\n"; + + if ($counts){ + warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t \ncoverage format:\t \n\n"; + } + else{ + warn "Generating additional sorted output in bedGraph format (output format: )\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + + warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; + + if ($CX_context){ + warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; + } + + if ($remove){ + warn "White spaces in read ID names will be removed prior to sorting\n"; + } + + if ($ample_mem){ + warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; + } + elsif (defined $sort_size){ + warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; + } + else{ + warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; + } + + + + sleep (1); + + if ($cytosine_report){ + warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; + warn '='x63,"\n"; + warn "Generating comprehensive genome-wide cytosine report\n(output format: )\n"; + + + if ($CX_context){ + warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting cytosine methylation in CpG context only (default)\n"; + } + + if ($split_by_chromosome){ + warn "Splitting the cytosine report output up into individual files for each chromosome\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + else{ # default, 1-based coords + warn "Using 1-based genomic coordinates (default)\n"; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + warn "Genome folder was specified as $genome_folder\n"; + } + else{ + $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; + warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; + } + sleep (1); + } +} + +warn "\n"; +sleep (1); + +###################################################### +### PROCESSING FILES +###################################################### + +foreach my $filename (@filenames){ + # resetting counters and filehandles + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + @sorting_files = (); + @bedfiles = (); + + %mbias_1 = (); + %mbias_2 = (); + + + ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor + unless ($vanilla){ + if ($paired){ + test_positional_sorting($filename); + } + } + + my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename); + + if ($pid == 0){ + warn "Finished processing child process. Exiting..\n"; + + # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + # foreach my $fh (keys %fhs) { + # if ($fh =~ /^[1230]$/) { + # foreach my $context (keys %{$fhs{$fh}}) { + # $fhs{$fh}->{$context}->flush; + # } + # } + # else{ + # $fhs{$fh}->flush; + # } + # } + exit 0; + } + + ### + if ($pid and $multicore > 1){ + warn "Now waiting for all child processes to complete\n"; + sleep(1); + + ### we need to ensure that we wait for all child processes to be finished before continuing + # warn "here are the child IDs: @$pids\n"; + # warn "Looping through the child process IDs:\n"; + + foreach my $id (@$pids){ + # print "$id\t"; + my $kid = waitpid ($id,0); + # print "Returned: $kid\nExit status: $?\n"; + unless ($? == 0){ + warn "\nChild process terminated with exit signal: '$?'\n\n"; + } + } + } + + ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + foreach my $fh (keys %fhs) { + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + close $fhs{$fh}->{$context} or die $!; + } + } + else{ + close $fhs{$fh} or die $!; + } + } + + ### We need to stitch together a main splitting report from all individual parent/child processes + if ($multicore > 1){ + merge_individual_splitting_reports($report_basename); + print_splitting_report(); + + merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally + + } + + unless ($mbias_off){ + ### printing out all M-Bias data + produce_mbias_plots ($filename); produce_mbias_plots ($filename); + + } + + unless ($mbias_only){ + delete_unused_files(); + } + + if ($bedGraph){ + + my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $out =~ s/gz$//; + $out =~ s/sam$//; + $out =~ s/bam$//; + $out =~ s/txt$//; + $out =~ s/$/bedGraph/; + + my $bedGraph_output = $out; + my @args; + + if ($remove){ + push @args, '--remove'; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($no_header){ + push @args, '--no_header'; + } + if ($gazillion){ + push @args, '--gazillion'; + } + if ($ample_mem){ + push @args, '--ample_memory'; + } + if ($zero){ + push @args, "--zero"; + } + + # if ($counts){ + # push @args, "--counts"; + # } + + push @args, "--buffer_size $sort_size"; + push @args, "--cutoff $coverage_threshold"; + push @args, "--output $bedGraph_output"; + push @args, "--dir '$output_dir'"; + + ### adding all files to be sorted to @args + foreach my $f (@sorting_files){ + push @args, $f; + } + + # print join "\t",@args,"\n"; + + system ("$Bin/bismark2bedGraph @args"); + + warn "Finished BedGraph conversion ...\n\n"; + sleep(1); + + # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; + # warn "Writing bedGraph to file: $bedGraph_output\n"; + # process_bedGraph_output(); + # close OUT or die $!; + + ### genome-wide cytosine methylation report requires bedGraph processing anyway + if ($cytosine_report){ + + @args = (); # resetting @args + my $cytosine_out = $out; + $cytosine_out =~ s/bedGraph$//; + + if ($CX_context){ + $cytosine_out =~ s/$/CX_report.txt/; + } + else{ + $cytosine_out =~ s/$/CpG_report.txt/; + } + + push @args, "--output $cytosine_out"; + push @args, "--dir '$output_dir'"; + push @args, "--genome '$genome_folder'"; + push @args, "--parent_dir '$parent_dir'"; + + if ($zero){ + push @args, "--zero"; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($split_by_chromosome){ + push @args, '--split_by_chromosome'; + } + if ($gzip){ + push @args, '--gzip'; + } + + my $coverage_output = $bedGraph_output; + $coverage_output =~ s/bedGraph$/bismark.cov.gz/; + + push @args, $coverage_output; # this will be the infile + # warn "Handing over the following infile: $coverage_output\n"; sleep (5); + system ("$Bin/coverage2cytosine @args"); + + warn "\n\nFinished generating genome-wide cytosine report\n\n"; + } + } +} + +sub merge_individual_splitting_reports{ + + my $report_basename = shift; + + my @splitting_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @splitting_reports, "$report_basename.$ext"; + } + warn "\nMerging individual splitting reports into overall report: '$report_basename'\n"; + warn "Merging from these individual files:\n"; + print join ("\n",@splitting_reports),"\n\n"; + sleep(1); + + ########## + # resetting the counter first + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + # repopulating the merged counter + foreach my $file (@splitting_reports){ + open (IR,$file) or die $!; + while (){ + chomp; + my ($context,$count) = (split /\t/); + if ($context){ + if ($context =~ /^Total C to T conversions in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CpG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHH_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCpG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHH_count} += $count; + } + elsif ($context =~ /^line count/){ + # warn "Line count\ncount: $count\n"; + $counting{sequences_count} = $count; # always the same + } + elsif ($context =~ /^meth call strings/){ + # warn "Meth call strings\ncount: $count\n"; + $counting{methylation_call_strings} += $count; + } + } + } + } + + # deleting the individual reports afterwards + foreach my $file (@splitting_reports){ + unlink $file; + } +} + + +sub merge_individual_mbias_reports{ + + my $report_basename = shift; + + my @mbias_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @mbias_reports, "$report_basename.${ext}.mbias"; + } + warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n"; + print join ("\n",@mbias_reports),"\n\n"; + + + ########## + # resetting the counters first, then repopulating them + %mbias_1 = (); + %mbias_2 = (); + + # repopulating the merged counter + foreach my $file (@mbias_reports){ + open (IR,$file) or die $!; + + my $context; + my $read; + + while (){ + chomp; + # warn "$_\n"; sleep(1); + if ($_ =~ /context/){ + $context = $1 if ($_ =~ /(\D{3}) context/); + # warn "Context set as $context\n"; + + if ($_ =~ /R2/){ + $read = 'R2'; + } + else{ + $read = 'R1'; + } + # warn "Setting read identity to '$read'\n"; + + # reading in 2 additional lines (===========, and header line) + $_ = ; + #warn "discarding line $_\n"; + $_ = ; + #warn "discarding line $_\n"; + next; + } + else{ + + if ($_ eq ''){ + # empty line, only occurs after a context has finished and before a new context starts + next; + } + + my ($pos,$meth,$unmeth) = (split /\t/); + # warn "$pos\t$meth\t$unmeth\n"; sleep(1); + if ($read eq 'R1'){ + $mbias_1{$context}->{$pos}->{meth} += $meth; + $mbias_1{$context}->{$pos}->{un} += $unmeth; + } + elsif ($read eq 'R2'){ + $mbias_2{$context}->{$pos}->{meth} += $meth; + $mbias_2{$context}->{$pos}->{un} += $unmeth; + } + } + } + close IR or warn "Had trouble closing filehandle for $file: $!\n"; + } + + # deleting the individual reports afterwards + foreach my $file (@mbias_reports){ + unlink $file; + } +} + + +sub delete_unused_files{ + + warn "Deleting unused files ...\n\n"; sleep(1); + + my $index = 0; + + while ($index <= $#sorting_files){ + if ($sorting_files[$index] =~ /gz$/){ + open (USED,"gunzip -c $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + else{ + open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + + my $used = 0; + + while (){ + next if (/^Bismark/); + if ($_){ + $used = 1; + last; + } + } + + if ($used){ + warn "$sorting_files[$index] contains data ->\tkept\n"; + ++$index; + } + else{ + + my $delete = unlink $sorting_files[$index]; + + if ($delete){ + warn "$sorting_files[$index] was empty ->\tdeleted\n"; + } + else{ + warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" + } + + ### we also need to remove the element from @sorting_files + splice @sorting_files, $index, 1; + } + } + warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) +} + +sub produce_mbias_plots{ + + my $filename = shift; + + my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $mbias =~ s/gz$//; + $mbias =~ s/sam$//; + $mbias =~ s/bam$//; + $mbias =~ s/cram$//; + $mbias =~ s/txt$//; + my $mbias_graph_1 = my $mbias_graph_2 = $mbias; + $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; + $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; + + $mbias =~ s/$/M-bias.txt/; + + open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + warn "Determining maximum read length for M-Bias plot\n"; + warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + warn "Determining maximum read lengths for M-Bias plots\n"; + warn "Maximum read length of Read 1: $max_length_1\n"; + warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + # sleep(3); + + my @mbias_read1; + my @mbias_read2; + + #Check whether the module GD::Graph:lines is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::lines; + GD::Graph::lines->import(); + }; + + unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...} + $gd_graph_installed = 1; + + #Check whether the module GD::Graph::colour is installed + eval{ + require GD::Graph::colour; + GD::Graph::colour->import(qw(:colours :lists :files :convert)); + }; + + if ($@) { + warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + $gd_graph_installed = 0; + } + + + } + else{ + warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + } + + + my $graph_title; + my $graph1; + my $graph2; + + if ( $gd_graph_installed){ + $graph1 = GD::Graph::lines->new(800,600); + if ($paired){ + $graph2 = GD::Graph::lines->new(800,600); + } + } + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read1[0]} = (); + + if ($paired){ + print MBIAS "$context context (R1)\n================\n"; + $graph_title = 'M-bias (Read 1)'; + } + else{ + print MBIAS "$context context\n===========\n"; + $graph_title = 'M-bias'; + } + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + push @{$mbias_read1[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read1[1]},$percent; + push @{$mbias_read1[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read1[2]},$percent; + push @{$mbias_read1[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read1[3]},$percent; + push @{$mbias_read1[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph1->set( + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# methylation calls', + title => $graph_title, + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + ) or die $graph1->error; + + $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd1 = $graph1->plot(\@mbias_read1)) { + open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + binmode MBIAS_G1; + print MBIAS_G1 $gd1->png; + } + else { + warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n"; + } + + # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; + # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + # binmode MBIAS_G1; + # print MBIAS_G1 $gd1->png; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read2[0]} = (); + + print MBIAS "$context context (R2)\n================\n"; + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + + push @{$mbias_read2[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read2[1]},$percent; + push @{$mbias_read2[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read2[2]},$percent; + push @{$mbias_read2[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read2[3]},$percent; + push @{$mbias_read2[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph2->set( + x_label => 'position (bp)', + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# calls', + title => 'M-bias (Read 2)', + ) or die $graph2->error; + + $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd2 = $graph2->plot(\@mbias_read2)) { + open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + binmode MBIAS_G2; + print MBIAS_G2 $gd2->png; + } + else { + warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n"; + } + + # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; + # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + # binmode MBIAS_G2; + # print MBIAS_G2 $gd2->png; + + } + } +} + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $ignore_r2; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_dir; + my $no_header; + my $bedGraph; + my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $cytosine_report; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $split_by_chromosome; + my $sort_size; + my $samtools_path; + my $gzip; + my $mbias_only; + my $mbias_off; + my $gazillion; + my $ample_mem; + my $include_overlap; + my $ignore_3prime; + my $ignore_3prime_r2; + my $multicore; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'ignore_r2=i' => \$ignore_r2, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + 'o|output=s' => \$output_dir, + 'no_header' => \$no_header, + 'bedGraph' => \$bedGraph, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "cytosine_report" => \$cytosine_report, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + "buffer_size=s" => \$sort_size, + 'samtools_path=s' => \$samtools_path, + 'gzip' => \$gzip, + 'mbias_only' => \$mbias_only, + 'mbias_off' => \$mbias_off, + 'gazillion|scaffolds' => \$gazillion, + 'ample_memory' => \$ample_mem, + 'include_overlap' => \$include_overlap, + 'ignore_3prime=i' => \$ignore_3prime, + 'ignore_3prime_r2=i' => \$ignore_3prime_r2, + 'multicore=i' => \$multicore, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### M-BIAS ONLY + if ($mbias_only){ + + if ($mbias_off){ + die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n"; + } + if ($bedGraph){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; + } + if ($cytosine_report){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; + } + if ($merge_non_CpG){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; + } + if ($full){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; + } + sleep(3); + } + + ### PRINT A REPORT + unless ($report){ + $report = 1; # making this the default + } + + ### OUTPUT DIR PATH + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + $output_dir = ''; + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + + ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header + + if ($vanilla){ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; + } + else{ # SAM/BAM format + + my $file = $filenames[0]; + warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); + + ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file + if ($file =~ /\.gz$/){ + open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.cram$/ || $file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam + open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM/CRAM file $file: $!\n"; + } + else{ + open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; + } + + while (){ + last unless (/^\@/); + if ($_ =~ /^\@PG/){ + # warn "found the \@PG line:\n"; + # warn "$_"; + + if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ + warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 1; + $single_end = 0; + } + else{ + warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 0; + $single_end = 1; + } + } + } + + # close DETERMINE or warn $!; # this always throws an error anyway... + + } + } + + ### IGNORING 5' END + # bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + + if (defined $ignore_r2){ + die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_r2 = 0; + } + + ### IGNORING 3' END + # bases at the end of the read when processing the methylation call string + unless ($ignore_3prime){ + $ignore_3prime = 0; + } + + if (defined $ignore_3prime_r2){ + die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_3prime_r2 = 0; + } + + + ### NO OVERLAP + ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps + if ($include_overlap){ + die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n"; + $no_overlap = 0; + } + else{ # default + if ($paired_end){ + warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n"; + $no_overlap = 1; + } + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + if ($zero){ + die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + + if ($CX_context){ + die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + else{ + $CX_context = 0; + } + + unless ($counts){ + $counts = 1; # counts will always be set + } + + if ($cytosine_report){ + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + unless ($bedGraph){ + warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; + $bedGraph = 1; + } + unless ($counts){ + # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; + $counts = 1; + } + warn "\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } + } + + unless (defined $samtools_path){ + $samtools_path = ''; + } + + + if ($gazillion){ + if ($ample_mem){ + die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; + } + } + + if (defined $multicore){ + unless ($multicore > 0){ + die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; + } + if ($multicore > 20){ + warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n"; + } + } + else{ + $multicore = 1; # default. Single-thread mode + warn "Setting core usage to single-threaded (default). Consider using --multicore to speed up the extraction process.\n\n"; + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore); +} + + +sub test_positional_sorting{ + + my $filename = shift; + + print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; + sleep(1); + + if ($filename =~ /\.gz$/) { + open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (TEST,$filename) or die "Can't open file $filename: $!\n"; + } + + my $count = 0; + + while () { + if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) + if (/^\@SO/) { + die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + next; + } + $count++; + + last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID + + my ($id_1) = (split (/\t/)); + + ### reading the next line which should be read 2 + $_ = ; + my ($id_2) = (split (/\t/)); + last unless ($id_2); + ++$count; + + if ($id_1 eq $id_2){ + ### ids are the same + next; + } + else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first + my $id_1_trunc = $id_1; + $id_1_trunc =~ s/\/1$//; + my $id_2_trunc = $id_2; + $id_2_trunc =~ s/\/2$//; + + unless ($id_1_trunc eq $id_2_trunc){ + die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + } + } + # close TEST or die $!; somehow fails on our cluster... + ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) + warn "...passed!\n"; + sleep(1); + +} + +sub process_Bismark_results_file{ + + my $filename = shift; + my $report_filename = open_output_filehandles($filename); + + ### disabling buffering so we don't run into problems with half written out lines... + foreach my $fh (keys %fhs){ + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + select($fhs{$fh}->{$context}); + $|++; + } + } + else{ + select($fhs{$fh}); + $|++; + } + } + select(STDOUT); + + ################################################ + ################################################ + ### multi-process handling + ### + + my $offset = 1; + my $process_id; + my @pids; + if ($multicore > 1){ + + until ($offset == $multicore){ + # warn "multicore: $multicore\noffset: $offset\n"; + my $fork = fork; + + if (defined $fork){ + if ($fork != 0){ + $process_id = $fork; + push @pids, $process_id; + if ($offset < $multicore){ + ++$offset; + # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; + } + else{ + # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; + } + } + elsif ($fork == 0){ + # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; + $process_id = $fork; + last; + } + } + else{ + die "Forking unsuccessful. Proceeding using a single thread only\n"; + } + } + + # warn "\nThe thread identity\n===================\n"; + if ($process_id){ + # print "I am the parent process. My children are called:\n"; + # print join ("\t",@pids),"\n"; + # print "I am going to process the following line count: $offset\n\n"; + } + elsif($process_id == 0){ + # warn "I am a child process: Process ID: $process_id\n"; + # warn "I am going to process the following line count: $offset\n\n"; + } + else{ + die "Process ID was: $process_id\n"; + } + } + else{ + # warn "Single-core mode: setting pid to 1\n"; + $process_id = 1; + } + + ################################################ + ################################################ + if ($process_id){ + warn "Now reading in Bismark result file $filename\n"; + } + else{ + warn "\nNow reading in Bismark result file $filename\n\n"; + } + + if ($filename =~ /\.gz$/) { + open (IN,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || $filename =~ /cram$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM/CRAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla) { + my $bismark_version = ; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/) { + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (1); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } else { + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single) { + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark output format + while () { + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call) { + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + $start += $ignore; + } + elsif ($strand eq '-') { + $start += length($meth_call)-1; ## $meth_call is already shortened! + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + + ### Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime + if ($ignore_3prime) { + + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + + ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly + if ($strand eq '+') { + # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all + } + elsif ($strand eq '-') { + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # position adjusted already, and because of this 3' trimming is irrelevant for the start position + } + else{ + # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step + $start += $ignore_3prime; + } + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage + + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } else { # processing single-end SAM format (default) + while () { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); + next; + } + + ++$line_count; + # warn "$line_count\n"; + warn "Processed lines: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call = $value; + $meth_call =~ s/\r//; + } elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + # warn "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call) { + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-') { + $meth_call = reverse $meth_call; + } + # warn "\n$meth_call\n"; + + ### IGNORE 5 PRIME: Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + # warn "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + for (1..$ignore) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar) { + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + + ####################### + ### INGORE 3' END ### + ####################### + + # Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime + if ($ignore_3prime) { + # warn "$meth_call\n"; + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + + # print "original CIGAR: $cigar\n"; + # print join ("",@comp_cigar),"\n"; + + ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly + if ($strand eq '+') { + + ### clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + # print "Final truncated CIGAR string:\n"; + # print join ("",@comp_cigar),"\n"; + # $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore_3prime) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + + } + + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # the start position has already been modified for --ignore already, so we don't have to adjust the position again now + } + else{ + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step + $start += $ignore_3prime + $D_count - $I_count; + } + + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore_3prime adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired) { + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark paired-end output format + while () { + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB!!! + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT!!! + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2) { + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' + + ### IGNORE FROM 5' END + if ($ignore) { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + } + if ($ignore_r2) { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified + $end_read_2 -= $ignore_r2; + } + + ### IGNORE 3' END + + ### Clipping off the last number of bases from the methylation call string of Read 1 as specified with --ignore_3prime + if ($ignore_3prime) { + $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + ### Clipping off the last number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 + if ($ignore_3prime_r2) { + $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + + my $end_read_1; + my $start_read_2; + + if ($strand eq '+') { + + $end_read_1 = $start_read_1 + length($meth_call_1) - 1; + $start_read_2 = $end_read_2 - length($meth_call_2) + 1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); + } + else { + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + else { # Bismark paired-end BAM/SAM output format (default) + while () { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + $_ = ; # reading and skipping another line since this is the paired-end read + next; + } + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = ; # reading in the paired read + chomp; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + # chomp $genome_conversion; # in case it captured a new line character # already removed + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + $strand = '+'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT + $strand = '-'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB + $strand = '+'; + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+') { + $meth_call_2 = reverse $meth_call_2; + } else { + $meth_call_1 = reverse $meth_call_1; + } + # warn "\n$meth_call_1\n$meth_call_2\n"; + + if ($meth_call_1 and $meth_call_2) { + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1) { + foreach (1..$len_1[$index]) { + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2) { + foreach (1..$len_2[$index]) { + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + ################################## + ### IGNORE BASES FROM 5' END ### + ################################## + + if ($ignore) { + ### Clipping off the first number of bases from the methylation call string as specified with '--ignore ' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + } + } + } + + if ($ignore_r2) { + ### Clipping off the first number of bases from the methylation call string as specified with '--ignore_r2 ' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_r2) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair! + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + } + } + + if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 5' END ### + ########################### + + ################################## + ### IGNORE BASES FROM 3' END ### + ################################## + + # print "CIGAR string before truncating 3' end (Read 1)\n"; + # print join ("",@comp_cigar_1),"\n"; + + if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime ' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore_3prime) <= 0){ + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime); + # warn "truncated meth_call 1:\n$meth_call_1\n"; + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + + # print "Final truncated CIGAR string (Read 1):\n"; + # print join ("",@comp_cigar_1),"\n"; + + } + elsif ($strand eq '-') { + + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start + for (1..$ignore_3prime) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + # print "Final truncated CIGAR string reverse_read:\n"; + # print join ("",@comp_cigar_1),"\n"; + + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1; + + } + } + } + + if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime_r2 ' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){ + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2); + # warn "truncated meth_call 2:\n$meth_call_2\n"; + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment) + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start + + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_3prime_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + # print "Final truncated CIGAR string 2 (+ alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + # Here we need to add the length ignore_3prime_r2 to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2; + + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + elsif ($strand eq '-') { + ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment) + ### ignore_r2 5' has already been taken care of at this stage, if relevant at all + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end + + for (1..$ignore_3prime_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + } + } + + # print "Final truncated CIGAR string 2 (- alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + } + } + } + + if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } + else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 3' END ### + ########################### + + + ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on + + if ($strand eq '+') { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2) { + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } + else { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + } + + if ($strand eq '+') { + ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + else { + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings + } + } + } + } else { + die "Single-end or paired-end reads not specified properly\n"; + } + + $counting{sequences_count} = $line_count; + $counting{methylation_call_strings} = $methylation_call_strings_processed; + + if ($multicore == 1){ + print_splitting_report (); + } + elsif ($multicore > 1){ + print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + } + + return ($process_id,\@pids,$report_filename); + +} + + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + warn "\nProcessed $counting{sequences_count} lines in total\n"; + warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + if ($report) { + print REPORT "\nProcessed $counting{sequences_count} lines in total\n"; + print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + +### + +sub print_splitting_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_report = $report_filename.".$offset"; + + open (SPECIAL_REPORT,'>',$special_report) or die $!; + # warn "line count\t$line_count\n"; + # warn "meth call strings\t$meth_call_strings\n"; + + print SPECIAL_REPORT "line count\t$line_count\n"; + print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n"; + + ### Calculating methylation percentages if applicable + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + + ### detailed information about Cs analysed + print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n"; +} + + +### + +### INDIVIDUAL M-BIAS REPORTS + +sub print_mbias_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_mbias_report = $report_filename.".${offset}.mbias"; + + open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + # warn "Determining maximum read length for M-Bias plot\n"; + # warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + # warn "Determining maximum read lengths for M-Bias plots\n"; + # warn "Maximum read length of Read 1: $max_length_1\n"; + # warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + + foreach my $context (qw(CpG CHG CHH)){ + + if ($paired){ + print SPECIAL_MBIAS "$context context (R1)\n================\n"; + } + else{ + print SPECIAL_MBIAS "$context context\n===========\n"; + } + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + print SPECIAL_MBIAS "\n"; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + + print SPECIAL_MBIAS "$context context (R2)\n================\n"; + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + } + } + + close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n"; +} + + +### + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; + + unless (defined $meth_call) { + return; # skip this read + } + + ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 + die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); + + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in Unknown context (was protected) ### + ### u for not methylated C in Unknown context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # this check speeds up the extraction process by up to 60%!!! + } + else{ # parsing CIGAR string + my @len; + my @ops; + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + if (@comp_cigar){ + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + } + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + if ($no_overlap) { # this has to be read 2... + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + + ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing + if ($cigar =~ /^\d+M$/){ + # warn "See!? I told you so! $cigar\n"; + # sleep(1); + } + else{ + + my @len; + my @ops; + + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + if (@comp_cigar){ # only needed for SAM reads with InDels + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + } + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1 - 1; + } + else{ # InDel read + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + +sub open_output_filehandles{ + + my $filename = shift; + + my $output_filename = (split (/\//,$filename))[-1]; + my $report_filename = $output_filename; + + ### OPENING OUTPUT-FILEHANDLES + if ($report) { + $report_filename =~ s/\.sam$//; + $report_filename =~ s/\.bam$//; + $report_filename =~ s/\.cram$//; + $report_filename =~ s/\.txt$//; + $report_filename =~ s/$/_splitting_report.txt/; + $report_filename = $output_dir . $report_filename; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report) { + + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + print REPORT "Bismark Extractor Version: $version\n"; + + if ($paired) { + if ($vanilla) { + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single) { + if ($vanilla) { + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + if ($single){ + if ($ignore) { + print REPORT "Ignoring first $ignore bp\n"; + } + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp\n"; + } + } + else{ # paired-end + if ($ignore) { + print REPORT "Ignoring first $ignore bp of Read 1\n"; + } + if ($ignore_r2){ + print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; + } + + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp of Read 1\n"; + } + if ($ignore_3prime_r2){ + print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n"; + } + + } + + if ($full) { + print REPORT "Output specified: comprehensive\n"; + } else { + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap) { + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta) { + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG) { + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + + ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG) { + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/cram$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option... + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless ($no_header) { + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/sam$/txt/; + $other_c_output =~ s/bam$/txt/; + $other_c_output =~ s/cram$/txt/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + $other_c_output = $output_dir . $other_c_output; + + if ($gzip){ + $other_c_output .= '.gz'; + open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); + push @sorting_files,$other_c_output; + + + unless ($no_header) { + print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG) { + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/cram$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/cram$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/cram$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/cram$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/sam$/txt/; + $other_c_ot =~ s/bam$/txt/; + $other_c_ot =~ s/cram$/txt/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + $other_c_ot = $output_dir . $other_c_ot; + + if ($gzip){ + $other_c_ot .= '.gz'; + open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); + push @sorting_files,$other_c_ot; + + unless($no_header){ + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/sam$/txt/; + $other_c_ctot =~ s/bam$/txt/; + $other_c_ctot =~ s/cram$/txt/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + $other_c_ctot = $output_dir . $other_c_ctot; + + if ($gzip){ + $other_c_ctot .= '.gz'; + open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); + push @sorting_files,$other_c_ctot; + + unless($no_header){ + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/sam$/txt/; + $other_c_ctob =~ s/bam$/txt/; + $other_c_ctob =~ s/cram$/txt/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + $other_c_ctob = $output_dir . $other_c_ctob; + + if ($gzip){ + $other_c_ctob .= '.gz'; + open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); + push @sorting_files,$other_c_ctob; + + unless($no_header){ + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/cram$/txt/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + $other_c_ob = $output_dir . $other_c_ob; + + if ($gzip){ + $other_c_ob .= '.gz'; + open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); + push @sorting_files,$other_c_ob; + + unless($no_header){ + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full) { + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/cram$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless($no_header){ + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/sam$/txt/; + $chg_output =~ s/bam$/txt/; + $chg_output =~ s/cram$/txt/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + $chg_output = $output_dir . $chg_output; + + if ($gzip){ + $chg_output .= '.gz'; + open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); + push @sorting_files,$chg_output; + + unless($no_header){ + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/sam$/txt/; + $chh_output =~ s/bam$/txt/; + $chh_output =~ s/cram$/txt/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + $chh_output = $output_dir . $chh_output; + + if ($gzip){ + $chh_output .= '.gz'; + open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); + push @sorting_files, $chh_output; + + unless($no_header){ + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else { + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/cram$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/cram$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/cram$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/cram$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/sam$/txt/; + $chg_ot =~ s/bam$/txt/; + $chg_ot =~ s/cram$/txt/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + $chg_ot = $output_dir . $chg_ot; + + if ($gzip){ + $chg_ot .= '.gz'; + open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); + push @sorting_files,$chg_ot; + + unless($no_header){ + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/sam$/txt/; + $chg_ctot =~ s/bam$/txt/; + $chg_ctot =~ s/cram$/txt/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + $chg_ctot = $output_dir . $chg_ctot; + + if ($gzip){ + $chg_ctot .= '.gz'; + open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); + push @sorting_files,$chg_ctot; + + unless($no_header){ + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/sam$/txt/; + $chg_ctob =~ s/bam$/txt/; + $chg_ctob =~ s/cram$/txt/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + $chg_ctob = $output_dir . $chg_ctob; + + if ($gzip){ + $chg_ctob .= '.gz'; + open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); + push @sorting_files,$chg_ctob; + + unless($no_header){ + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/sam$/txt/; + $chg_ob =~ s/bam$/txt/; + $chg_ob =~ s/cram$/txt/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + $chg_ob = $output_dir . $chg_ob; + + if ($gzip){ + $chg_ob .= '.gz'; + open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); + push @sorting_files,$chg_ob; + + unless($no_header){ + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/sam$/txt/; + $chh_ot =~ s/bam$/txt/; + $chh_ot =~ s/cram$/txt/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + $chh_ot = $output_dir . $chh_ot; + + if ($gzip){ + $chh_ot .= '.gz'; + open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); + push @sorting_files,$chh_ot; + + unless($no_header){ + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/sam$/txt/; + $chh_ctot =~ s/bam$/txt/; + $chh_ctot =~ s/cram$/txt/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + $chh_ctot = $output_dir . $chh_ctot; + + if ($gzip){ + $chh_ctot .= '.gz'; + open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); + push @sorting_files,$chh_ctot; + + unless($no_header){ + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/sam$/txt/; + $chh_ctob =~ s/bam$/txt/; + $chh_ctob =~ s/cram$/txt/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + $chh_ctob = $output_dir . $chh_ctob; + + if ($gzip){ + $chh_ctob .= '.gz'; + open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); + push @sorting_files,$chh_ctob; + + unless($no_header){ + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/sam$/txt/; + $chh_ob =~ s/bam$/txt/; + $chh_ob =~ s/cram$/txt/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + $chh_ob = $output_dir . $chh_ob; + + if ($gzip){ + $chh_ob .= '.gz'; + open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); + push @sorting_files,$chh_ob; + + unless($no_header){ + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + return $report_filename; +} + +sub isBam{ + + my $filename = shift; + + # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) + open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; + + ### when BAM files read through a gunzip -c stream they start with BAM... + my $bam_in_disguise = ; + # warn "BAM in disguise: $bam_in_disguise\n\n"; + + if ($bam_in_disguise){ + if ($bam_in_disguise =~ /^BAM/){ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 1; + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } +} + + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper (in BAM/CRAM/SAM format) and extracts the +methylation information for individual cytosines. This information is found in the +methylation call field which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context ~~~ + ~~~ x for not methylated C CHG ~~~ + ~~~ H for methylated C in CHH context ~~~ + ~~~ h for not methylated C in CHH context ~~~ + ~~~ Z for methylated C in CpG context ~~~ + ~~~ z for not methylated C in CpG context ~~~ + ~~~ U for methylated C in Unknown context (CN or CHN ~~~ + ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + + + + +USAGE: bismark_methylation_extractor [options] + + +ARGUMENTS: +========== + + A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. If neither -s nor -p is set the type of experiment will + be determined automatically. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. If neither -s nor -p is set the type of experiment will + be determined automatically. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice (only methylation calls of read 1 are used for in the process + since read 1 has historically higher quality basecalls than read 2). + Whilst this option removes a bias towards more methylation calls + in the center of sequenced fragments it may de facto remove a sizable + proportion of the data. This option is on by default for paired-end data + but can be disabled using --include_overlap. Default: ON. + +--include_overlap For paired-end data all methylation calls will be extracted irrespective of + of whether they overlap or not. Default: OFF. + +--ignore Ignore the first bp from the 5' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove e.g. a + restriction enzyme site at the start of each read or any other source of + bias (such as PBAT-Seq data). + +--ignore_r2 Ignore the first bp from the 5' end of Read 2 of paired-end sequencing + results only. Since the first couple of bases in Read 2 of BS-Seq experiments + show a severe bias towards non-methylation as a result of end-repairing + sonicated fragments with unmethylated cytosines (see M-bias plot), it is + recommended that the first couple of bp of Read 2 are removed before + starting downstream analysis. Please see the section on M-bias plots in the + Bismark User Guide for more details. + +--ignore_3prime Ignore the last bp from the 3' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove unwanted + biases from the end of reads. + +--ignore_3prime_r2 Ignore the last bp from the 3' end of Read 2 of paired-end sequencing + results only. This can remove unwanted biases from the end of reads. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. Default: ON. + +--no_header Suppresses the Bismark version header line in all output files for more convenient + batch processing. + +-o/--output DIR Allows specification of a different output directory (absolute or relative + path). If not specified explicitly, the output will be written to the current directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in + a GZIP compressed form to save disk space. This option is also passed on to the genome-wide + cytosine report. BedGraph and coverage files will be written out as .gz by default. + +--version Displays version information. + +-h/--help Displays this help file and exits. + +--mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as + well as a report (optional) and then quit. Default: OFF. + +--mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report. + Only recommended for users who deliberately want to keep an earlier version of the M-bias report. + Default: OFF. + +--multicore Sets the number of cores to be used for the methylation extraction process. If system resources + are plentiful this is a viable option to speed up the extraction process (we observed a near linear + speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file + and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore + specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so + --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing + on the bismark2bedGraph or genome-wide cytosine report processes. + + + + +bedGraph specific options: +========================== + +--bedGraph After finishing the methylation extraction, the methylation output is written into a + sorted bedGraph file that reports the position of a given cytosine and its methylation + state (in %, see details below). The methylation extractor output is temporarily split up into + temporary files, one per chromosome (written into the current directory or folder + specified with -o/--output); these temp files are then used for sorting and deleted + afterwards. By default, only cytosines in CpG context will be sorted. The option + '--CX_context' may be used to report all cytosines irrespective of sequence context + (this will take MUCH longer!). The default folder for temporary files during the sorting + process is the output directory. The bedGraph conversion step is performed by the external + module 'bismark2bedGraph'; this script needs to reside in the same folder as the + bismark_methylation_extractor itself. + +--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start + and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, + instead of using 1-based coordinates throughout. Default: OFF. + + +--cutoff [threshold] The minimum number of times any methylation state (methylated or unmethylated) has to be seen + for a nucleotide before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + +--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of + scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to + individual chromosome files. These errors were caused by the operating system's limit + of the number of filehandle that can be written to at any one time (typically 1024; to + find out this limit on Linux, type: ulimit -a). + To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort + methylation calls into individual chromosome files. Instead, all input files are + temporarily merged into a single file (unless there is only a single file), and this + file will then be sorted by both chromosome AND position using the Unix sort command. + Please be aware that this option might take a looooong time to complete, depending on + the size of the input files, and the memory you allocate to this process (see --buffer_size). + Nevertheless, it seems to be working. + +--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will + instead use two arrays to sort methylated and unmethylated calls. This may result in a faster + sorting process of very large files, but this comes at the cost of a larger memory footprint + (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB + of RAM). Due to overheads in creating and looping through these arrays it seems that it will + actually be *slower* for small files (few million alignments), and we are currently testing at + which point it is advisable to use this option. Note that --ample_memory is not compatible + with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). + + + +Genome-wide cytosine methylation report specific options: +========================================================= + +--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a + genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based + chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all + cytosine context is optional). The output considers all Cs on both forward and reverse strands and + reports their position, strand, trinucleotide content and methylation state (counts are 0 if not + covered). The cytosine report conversion step is performed by the external module + 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor + itself. + +--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF. + +--genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + + + +OUTPUT: + +The bismark_methylation_extractor output is in the form: +======================================================== + + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + + +The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): +========================================================================================================= + +track type=bedGraph (header line) + + + + + +The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'): +============================================================================================================================================ + + + + + +The genome-wide cytosine methylation output file is tab-delimited in the following format: +========================================================================================== + + + + +This script was last modified on 25 February 2016. + +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methyl_extractor/bismark_methylation_extractor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/bismark_methylation_extractor.py Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +import argparse, os, shutil, subprocess, sys, tempfile, fileinput +import zipfile +import re +from glob import glob + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def zipper(dir, zip_file): + output_files_regex = re.compile('^(Non_)?C[pH][GH]_.*') + bedgraph_regex = re.compile('.*bedGraph.gz') + zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) + root_len = len(os.path.abspath(dir)) + for root, dirs, files in os.walk(dir): + archive_root = os.path.abspath(root)[root_len:] + for f in files: + if re.search(output_files_regex, f) or re.search(bedgraph_regex, f): + fullpath = os.path.join(root, f) + archive_name = os.path.join(archive_root, f) + zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED) + zip.close() + return zip_file + +def build_genome_dir(genome_file): + tmp_genome_dir = tempfile.mkdtemp(prefix='tmp') + genome_path = os.path.join(tmp_genome_dir, '.'.join(os.path.split(genome_file)[1].split('.')[:-1])) + try: + """ + Create a hard link pointing to genome_file named 'genome_path'.fa. + """ + os.symlink(genome_file, genome_path + '.fa') + except Exception, e: + if os.path.exists(tmp_genome_dir): + shutil.rmtree(tmp_genome_dir) + stop_err('Error in linking the reference database.\n' + str(e)) + return tmp_genome_dir + +def __main__(): + #Parse Command Line + parser = argparse.ArgumentParser(description='Wrapper for the bismark methylation caller.') + + # input options + parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) + + parser.add_argument( '--infile', help='Input file in SAM or BAM format.' ) + parser.add_argument( '--single-end', dest='single_end', action="store_true" ) + parser.add_argument( '--paired-end', dest='paired_end', action="store_true" ) + + parser.add_argument('--splitting_report', dest='splitting_report') + parser.add_argument('--mbias_report', dest='mbias_report') + parser.add_argument('--cytosine_report', dest="cytosine_report") + parser.add_argument('--genome_file', dest="genome_file") + parser.add_argument('--cx_context', action="store_true" ) + + parser.add_argument( '--comprehensive', action="store_true" ) + parser.add_argument( '--merge-non-cpg', dest='merge_non_cpg', action="store_true" ) + parser.add_argument( '--no-overlap', dest='no_overlap', action="store_true" ) + parser.add_argument( '--compress' ) + parser.add_argument('--ignore', dest='ignore', type=int) + parser.add_argument('--ignore_r2', dest='ignore_r2', type=int) + parser.add_argument('--ignore_3prime', dest='ignore_3prime', type=int) + parser.add_argument('--ignore_3prime_r2', dest='ignore_3prime_r2', type=int) + + args = parser.parse_args() + + # Build methylation extractor command + output_dir = tempfile.mkdtemp() + cmd = 'bismark_methylation_extractor --no_header -o %s %s %s' + if args.bismark_path: + # add the path to the bismark perl scripts, that is needed for galaxy + cmd = os.path.join(args.bismark_path, cmd) + + # Set up all options + additional_opts = '' + if args.single_end: + additional_opts += ' --single-end ' + else: + additional_opts += ' --paired-end ' + if args.no_overlap: + additional_opts += ' --no_overlap ' + if args.ignore: + additional_opts += ' --ignore %s ' % args.ignore + if args.ignore_r2: + additional_opts += ' --ignore_r2 %s ' % args.ignore_r2 + if args.ignore_3prime: + additional_opts += ' --ignore_3prime %s ' % args.ignore_3prime + if args.ignore_3prime_r2: + additional_opts += ' --ignore_3prime_r2 %s ' % args.ignore_3prime_r2 + if args.comprehensive: + additional_opts += ' --comprehensive ' + if args.merge_non_cpg: + additional_opts += ' --merge_non_CpG ' + if args.splitting_report: + additional_opts += ' --report ' + if args.cytosine_report: + tmp_genome_dir = build_genome_dir(args.genome_file) + if args.cx_context: + additional_opts += ' --bedgraph --CX_context --cytosine_report --CX_context --genome_folder %s ' % tmp_genome_dir + else: + additional_opts += ' --bedgraph --cytosine_report --genome_folder %s ' % tmp_genome_dir + + + #detect BAM file, use samtools view if it is a bam file + f = open (args.infile, 'rb') + sig = f.read(4) + f.close() + if sig == '\x1f\x8b\x08\x04' : + #cmd = cmd % (output_dir, additional_opts, '-') + new_infilename = os.path.join(output_dir, 'submitted_bs_mapped_reads.sam') + new_sam = open(new_infilename, 'wb') + tmp_err = tempfile.NamedTemporaryFile().name + tmp_stderr = open(tmp_err, 'wb') + proc = subprocess.Popen(['samtools', 'view', args.infile], stdout=new_sam, stderr=tmp_stderr) + new_sam.close() + tmp_stderr.close() + if os.stat(tmp_err).st_size != 0: + tmp_sterr = open(tmp_err, 'rb') + error_msg = tmp_sterr.read() + tmp_sterr.close() + sys.exit("error: %s" % error_msg) + cmd = cmd % (output_dir, additional_opts, new_infilename) + else: + cmd = cmd % (output_dir, additional_opts, args.infile) + + # Run + try: + tmp_out = tempfile.NamedTemporaryFile().name + tmp_stdout = open( tmp_out, 'wb' ) + tmp_err = tempfile.NamedTemporaryFile().name + tmp_stderr = open( tmp_err, 'wb' ) + proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) + returncode = proc.wait() + tmp_stderr.close() + # get stderr, allowing for case where it's very large + tmp_stderr = open( tmp_err, 'rb' ) + stderr = '' + buffsize = 1048576 + try: + while True: + stderr += tmp_stderr.read( buffsize ) + if not stderr or len( stderr ) % buffsize != 0: + break + except OverflowError: + pass + tmp_stdout.close() + tmp_stderr.close() + if returncode != 0: + raise Exception, stderr + + # TODO: look for errors in program output. + except Exception, e: + stop_err( 'Error in bismark methylation extractor:\n' + str( e ) ) + + # collect and copy output files + if args.compress: + zipper(output_dir, args.compress) + + # cytosine report + if args.cytosine_report: + if args.cx_context: + shutil.move( glob(os.path.join( output_dir, '*CX_report.txt'))[0], args.cytosine_report ) + else: + shutil.move(glob(os.path.join(output_dir, '*CpG_report.txt'))[0], args.cytosine_report) + # splitting report + if args.splitting_report: + shutil.move( glob(os.path.join( output_dir, '*_splitting_report.txt'))[0], args.splitting_report ) + if args.mbias_report: + shutil.move(glob(os.path.join(output_dir, '*M-bias.txt'))[0], args.mbias_report) + + + #Clean up temp dirs + if os.path.exists( output_dir ): + shutil.rmtree( output_dir ) + if args.cytosine_report: + if os.path.exists( tmp_genome_dir ): + shutil.rmtree( tmp_genome_dir ) + +if __name__=="__main__": __main__() diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methyl_extractor/bismark_methylation_extractor.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/bismark_methylation_extractor.xml Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,304 @@ + + + Reports on methylation status of reads mapped by Bismark + + + + samtools + bowtie2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+ + + + ( splitting_report is True ) + + + ( mbias_report is True ) + + + ( cytosine_report['cytosine_report_selector'] ) + + + ( output_settings['compress'] ) + + + + + + + +* + + * **Ignore the last N bp from the 3' end of single-end read when processing the methylation call string.** + + | This can remove unwanted biases from the end of reads. + | + | *Input option --ignore_3prime * + +* **If Paired-End Reads** + + * **Ignore the first N bp from the 5’ end of Read 1 when processing the methylation call string** + + | This can remove e.g. a restriction enzyme site at the start of each read or any other source of bias (e.g. PBAT-Seq data). + | + | *Input option --ignore * + + * **Ignore the last N bp from the 3’ end of Read 1 when processing the methylation call string** + + | This can remove unwanted biases from the end of reads. + | + | *Input option --ignore_3prime * + + * **Ignore the first N bp from the 5' end of Read 2 of paired-end sequencing results** + + | Since the first couple of bases in Read 2 of BS-Seq experiments show a severe bias towards non-methylation as a result of end-repairing sonicated fragments with unmethylated cytosines (see M-bias plot), it is recommended that the first couple of bp of Read 2 are removed before starting downstream analysis. Please see the section on M-bias plots in the Bismark User Guide for more details. The options --ignore and --ignore_r2 can be combined in any desired way. + | + | *Input option --ignore_r2* + + * **Ignore the last N bp from the 3' end of Read 2 of paired-end sequencing results** + + | This can remove unwanted biases from the end of reads. + | + | *Input option --ignore_3prime_r2* + + * **This option avoids scoring overlapping methylation calls twice, in case of overlapping read one and read two** + + | For paired-end reads it is theoretically possible that read_1 and read_2 overlap. This option avoids scoring overlapping methylation calls twice (only methylation calls of read 1 are used for in the process since read 1 has historically higher quality basecalls than read 2). Whilst this option removes a bias towards more methylation calls in the center of sequenced fragments it may de facto remove a sizable proportion of the data. This option is highly recommended for paired-end data. + | + | *Input option --no_overlap* + +* **Short methylation summary output (Splitting Report)** + + | Prints out a short methylation summary as well as the paramaters used to run this script. + | + | *Output option --report* + +* **Methylation proportion report for each possible position in the read (Mbias Report)** + + | This report shows the methylation proportion across each possible position in the read (described in further detail in:Hansen et al., Genome Biology, 2012, 13:R83). The data for the M-bias plot is also written into a text file and is in the following format: + | + | <% methylation> + | + | This allows generating nice graphs by alternative means, e.g. using R or Excel + +* **Genome-wide methylation report for all cytosines in the genome** + + | the option --cytosine_report produces a genome-wide methylation report for all cytosines in the genome. + + * **If CpG Context only** + + | the output uses 1-based chromosome coordinates (zero-based cords are optional) and reports CpG context only (all cytosine context is optional). The output considers all Cs on both forward and reverse strands and reports their position, strand, trinucleotide content and methylation state (counts are 0 if not covered). + | + | *Genome-wide cytosine methylation report specific option --bedgraph --cytosine_report --genome_folder * + + * **If not CpG Context only** + + | The output file contains information on every single cytosine in the genome irrespective of its context. This applies to both forward and reverse strands. Please be aware that this will generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. Default: OFF (i.e. Default = CpG context only). + | + | *Genome-wide cytosine methylation report specific option --bedgraph --CX_context --cytosine_report --CX_context --genome_folder * + +* **Merge all four possible strand-specific methylation info into context-dependent output files** + + | Specifying this option will merge all four possible strand-specific methylation info into context-dependent output files. The default contexts are: + | - CpG context + | - CHG context + | - CHH context + | + | *Output option --comprehensive* + +* **Merge all non-CpG contexts into one file** + + | This will produce two output files (in --comprehensive mode) or eight strand-specific output files (default) for Cs in + | - CpG context + | - non-CpG context + | + | *Output option --merge_non_CpG* + +* **Compress all result files and output one single file** + + | The methylation extractor files (CpG_OT..., CpG_OB... etc) will be written out in a GZIP compressed form to save disk space. This option does not work on bedGraph and genome-wide cytosine reports as they are 'tiny' anyway. + | + | *Output option --gzip* + +]]> + + + + 10.1093/bioinformatics/btr167 + +
diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methyl_extractor/coverage2cytosine --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_methyl_extractor/coverage2cytosine Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,1254 @@ +#!/usr/bin/env perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my %chromosomes; # storing sequence information of all chromosomes/scaffolds +my %processed; # keeping a record of which chromosomes have been processed +my $coverage2cytosine_version = 'v0.16.3'; + +my ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra) = process_commandline(); + +warn "Summary of parameters for genome-wide cytosine report:\n"; +warn '='x78,"\n"; +warn "Coverage infile:\t\t\t$coverage_infile\n"; +warn "Output directory:\t\t\t>$output_dir<\n"; +warn "Parent directory:\t\t\t>$parent_dir<\n"; +warn "Genome directory:\t\t\t>$genome_folder<\n"; + +if ($CX_context){ + warn "CX context:\t\t\t\tyes\n"; +} +else{ + warn "CX context:\t\t\t\tno (CpG context only, default)\n"; +} +if ($merge_CpGs){ + warn "Pooling CpG top/bottom strand evidence:\tyes\n"; +} +if($gc_context){ + warn "Optional GC context track:\t\tyes\n"; +} +if ($tetra){ + warn "Tetra/Penta nucleotide context:\t\tyes\n"; +} + +if ($zero){ + warn "Genome coordinates used:\t\t0-based (user specified)\n"; +} +else{ + warn "Genome coordinates used:\t\t1-based (default)\n"; +} + +if ($gzip){ + warn "GZIP compression:\t\t\tyes\n"; +} +else{ + warn "GZIP compression:\t\t\tno\n"; +} + +if ($split_by_chromosome){ + warn "Split by chromosome:\t\t\tyes\n\n\n"; +} +else{ + warn "Split by chromosome:\t\t\tno\n\n\n"; +} +sleep (3); + +read_genome_into_memory(); +warn "Stored sequence information of ",scalar keys %chromosomes," chromosomes/scaffolds in total\n\n"; + +generate_genome_wide_cytosine_report($coverage_infile); + +### 11 December 2014 + +# The following optional section re-reads the genome-wide report and merges methylation evidence of both top and bottom strand +# into a single CpG dinucleotide entity. This significantly simplifies downstream processing, e.g. by the bsseq R-/Bioconductor package +# which recommends this merging process to increase coverage per CpG and reduce the memory burden for its processing + +if ($merge_CpGs){ + # we only allow this operation if the report is limited to CpG context, and for a single report for the entire genome for the time being + combine_CpGs_to_single_CG_entity($cytosine_out); +} + +### 18 August 2015 + +# The following section reprocessed the genome to generate cytosine methylation output in GC context (e.g. when a GpC methylase had been deployed +if ($gc_context){ + generate_GC_context_report($coverage_infile); +} + + +sub combine_CpGs_to_single_CG_entity{ + my $CpG_report_file = shift; + warn "Now merging top and bottom strand CpGs into a single CG dinucleotide entity\n"; + + open (IN,$CpG_report_file) or die "Failed to open file $CpG_report_file: $!\n\n"; + my $pooled_CG = $CpG_report_file; + $pooled_CG =~ s/$/.merged_CpG_evidence.cov/; + open (OUT,'>',$pooled_CG) or die "Failed to write to file '$pooled_CG': $!\n\n"; + warn ">>> Writing a new coverage file with top and bottom strand CpG methylation evidence merged to $pooled_CG <<<\n\n"; + sleep(1); + + while (1){ + my $line1 = ; + my $line2 = ; + last unless ($line1 and $line2); + + chomp $line1; + chomp $line2; + + my ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); + my ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); + # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; + # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; sleep(1); + + if ($pos1 < 2){ + $line1 = $line2; + $line2 = ; + chomp $line2; + ($chr1,$pos1,$strand1,$m1,$u1,$context1) = (split /\t/,$line1); + ($chr2,$pos2,$strand2,$m2,$u2,$context2) = (split /\t/,$line2); + } + + # a few sanity checks + die "The context of the first line was not CG:\t$line1" unless ($context1 eq 'CG'); + die "The context of the second line was not CG:\t$line2" unless ($context2 eq 'CG'); + + unless ($strand1 eq '+' and $strand2 eq'-'){ + die "The strand of line 1 and line 2 were not + and -:\nline1:\t$line1\nline2:\t$line2\n"; + } + unless ($pos2 eq ($pos1 + 1)){ + die "The reported position were not spaced 1bp apart:\nline1:\t$line1\nline2:\t$line2\n"; + } + unless($chr1 eq $chr2){ + die "The chromsome information for line 1 and 2 did not match:\nline1:\t$line1\nline2:\t$line2\n"; + } + + # This looks alright from what I can tell, lets pool the 2 strands + + my $pooled_m = $m1 + $m2; + my $pooled_u = $u1 + $u2; + + # since this is a new coverage file we only write it out if the coverage is at least 1 + next unless ($pooled_m + $pooled_u) > 0; + + my $pooled_percentage = sprintf ("%.6f",$pooled_m/ ($pooled_m + $pooled_u) *100); + # print join ("\t",$chr1,$pos1,$strand1,$m1,$u1,$context1),"\n"; + # print join ("\t",$chr2,$pos2,$strand2,$m2,$u2,$context2),"\n"; + # print join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; + # sleep(1); + print OUT join ("\t",$chr1,$pos1,$pos2,$pooled_percentage,$pooled_m,$pooled_u),"\n"; + } + + warn "CpG merging complete. Good luck finding DMRs with bsseq!\n\n"; + +} + + +sub process_commandline{ + my $help; + my $output_dir; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $gzip; + + my $split_by_chromosome; + my $cytosine_out; + my $parent_dir; + my $version; + my $merge_CpGs; + my $gc_context; + my $tetra; + + my $command_line = GetOptions ('help|man' => \$help, + 'dir=s' => \$output_dir, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + 'o|output=s' => \$cytosine_out, + 'parent_dir=s' => \$parent_dir, + 'version' => \$version, + 'merge_CpGs' => \$merge_CpGs, + 'GC|GC_context' => \$gc_context, + 'ff|qq' => \$tetra, + 'gzip' => \$gzip, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($version){ + print << "VERSION"; + + + Bismark Methylation Extractor Module - + coverage2cytosine + + Bismark Extractor Version: $coverage2cytosine_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + ### no files provided + unless (@ARGV){ + warn "You need to provide a Bismark coverage file (with counts methylated/unmethylated cytosines) to create an individual C methylation output. Please respecify!\n"; + sleep(2); + + print_helpfile(); + exit; + } + + my $coverage_infile = shift @ARGV; + + unless ($parent_dir){ + $parent_dir = getcwd(); + } + unless ($parent_dir =~ /\/$/){ + $parent_dir =~ s/$/\//; + } + + unless (defined $cytosine_out){ + die "Please provide the name of the output file using the option -o/--output filename\n"; + } + + ### OUTPUT DIR PATH + if (defined $output_dir){ + unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + } + else{ + $output_dir = ''; + } + + unless ($CX_context){ + $CX_context = 0; + $CpG_only = 1; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + if ($merge_CpGs){ + if ($CX_context){ + die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if CpG-context is selected only (lose the option --CX)\n"; + } + if ($split_by_chromosome){ + die "Merging individual CpG calls into a single CpG dinucleotide entity is currently only supported if a single CpG report is written out (lose the option --split_by_chromosome)\n"; + } + } + + return ($output_dir,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$parent_dir,$coverage_infile,$cytosine_out,$merge_CpGs,$gc_context,$gzip,$tetra); +} + + + +sub generate_genome_wide_cytosine_report { + + warn "="x78,"\n"; + warn "Methylation information will now be written into a genome-wide cytosine report\n"; + warn "="x78,"\n\n"; + sleep (2); + + my $number_processed = 0; + + ### changing to the output directory again + unless ($output_dir eq ''){ # default + chdir $output_dir or die "Failed to change directory to $output_dir\n"; + # warn "Changed directory to $output_dir\n"; + } + + my $in = shift; + # infiles handed over by the methylation extractor will be just the filename on their own. The directory should have been handed over with --dir + if ($in =~ /gz$/){ + open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; # changed from gunzip -c to gunzip -c 08 04 16 + } + else{ + open (IN,"$in") or die "Failed to read from file $in: $!\n"; + } + + + ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands + unless ($split_by_chromosome){ ### writing all output to a single file (default) + if ($gzip){ + unless ($cytosine_out =~ /\.gz$/){ + $cytosine_out .= '.gz'; + } + open (CYT,"| gzip -c - > $cytosine_out") or die "Failed to write to file $cytosine_out: $!\n"; + } + else{ + open (CYT,'>',$cytosine_out) or die "Failed to write to file $cytosine_out: $!\n"; + } + warn ">>> Writing genome-wide cytosine report to: $cytosine_out <<<\n\n"; + sleep (1); + } + + my $last_chr; + my %chr; # storing reads for one chromosome at a time + + my $count = 0; + while (){ + chomp; + ++$count; + my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); + + # defining the first chromosome + unless (defined $last_chr){ + $last_chr = $chr; + ++$number_processed; + # warn "Storing all covered cytosine positions for chromosome: $chr\n"; + } + + ### As of version 0.9.1 the start positions are 1-based! + if ($chr eq $last_chr){ + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + else{ + warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + ++$number_processed; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /\.txt$/){ + $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; + } + else{ + $chromosome_out =~ s/$/.chr${last_chr}.txt/; + } + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + + if ($tetra){ + $tetra_nt = ''; # clearing + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$last_chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + if ($tetra){ + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + $strand = '+'; + } + elsif ($1 eq 'G'){ # C on reverse strand + + $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + $strand = '-'; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + + # if (length$penta_nt < 5){ + # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); + # } + + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! (as of v0.9.1) + $meth = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; + } + + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The sequence context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n" + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + %chr = (); # resetting the hash + + # new first entry + $last_chr = $chr; + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + } + + # Last found chromosome + # If there never was a last chromosome then something must have gone wrong with reading the data in + unless (defined $last_chr){ + die "No last chromosome was defined, something must have gone wrong while reading the data in (e.g. specified wrong file path for a gzipped coverage file?). Please check your command!\n\n"; + } + + warn "Writing cytosine report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + $processed{$last_chr} = 1; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /\.txt$/){ # files passed on by the methylation extractor end in _report.txt + $chromosome_out =~ s/\.txt$/chr${last_chr}.txt/; + } + else{ # user specified output file name + $chromosome_out =~ s/$/.chr${last_chr}.txt/; + } + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + while ( $chromosomes{$last_chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + + if ($tetra){ + $tetra_nt = ''; + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$last_chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + $strand = '+'; + + if ($tetra){ + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$last_chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + + } + elsif ($1 eq 'G'){ # C on reverse strand + $tri_nt = substr ($chromosomes{$last_chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + $strand = '-'; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$last_chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$last_chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + } + + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! as of v0.9.1 + $meth = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth = $chr{$last_chr}->{$pos}->{nonmeth}; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + # if (length$penta_nt < 5){ + # warn "$tri_nt\t$tetra_nt\t$penta_nt\n"; sleep(1); + # } + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero-based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$last_chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + warn "Finished writing out cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; + + ### Now processing chromosomes that were not covered in the coverage file + warn "Now processing chromosomes that were not covered by any methylation calls in the coverage file...\n"; + my $unprocessed = 0; + + foreach my $chr (sort keys %processed) { + unless ( $processed{$chr} ) { + ++$unprocessed; + ++$number_processed; + process_unprocessed_chromosomes($chr); + } + } + + if ($unprocessed == 0) { + warn "All chromosomes in the genome were covered by at least some reads. coverage2cytosine processing complete.\n\n"; + } + else{ + warn "Finished writing out cytosine report (processed $number_processed chromosomes/scaffolds in total). coverage2cytosine processing complete.\n\n"; + } + + close CYT or warn $!; + +} + + +#### GC CONTEXT - optional +#### + +sub generate_GC_context_report { + + warn "="x82,"\n"; + warn "Methylation information for GC context will now be written to a GpC-context report\n"; + warn "="x82,"\n\n"; + sleep (2); + + my $number_processed = 0; + + ### changing to the output directory again + unless ($output_dir eq ''){ # default + chdir $output_dir or die "Failed to change directory to $output_dir\n"; + # warn "Changed directory to $output_dir\n"; + } + + my $in = shift; + if ($in =~ /gz$/){ + open (IN,"gunzip -c $in |") or die "Failed to read from gzipped file $in: $!\n"; + } + else{ + open (IN,"$in") or die "Failed to read from file $in: $!\n"; + } + + my $gc_out = $cytosine_out.'.GpC_report.txt'; + my $gc_cov = $cytosine_out.'.GpC.cov'; + + ### note: we are still in the folder: $output_dir, so we do not have to include this into the open commands + open (GC,'>',$gc_out) or die "Failed to write to GpC file $gc_out: !\n\n"; + warn ">>> Writing genome-wide GpC cytosine report to: $gc_out <<<\n"; + open (GCCOV,'>',$gc_cov) or die "Failed to write to GpC coverage file $gc_cov: !\n\n"; + warn ">>> Writing genome-wide GpC coverage file to: $gc_cov <<<\n\n"; + + my $last_chr; + my %chr; # storing reads for one chromosome at a time + + my $count = 0; + while (){ + chomp; + ++$count; + my ($chr,$start,$end,undef,$meth,$nonmeth) = (split /\t/); + + # defining the first chromosome + unless (defined $last_chr){ + $last_chr = $chr; + ++$number_processed; + # warn "Storing all covered cytosine positions for chromosome: $chr\n"; + } + + ### start positions are 1-based + if ($chr eq $last_chr){ + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + else{ + warn "Writing cytosine report for chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + ++$number_processed; + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /(GC)/g){ + + # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time + my $context_top = ''; + my $context_bottom = ''; + my $pos = pos$chromosomes{$last_chr}; + + my $meth_top = 0; + my $meth_bottom = 0; + my $nonmeth_top = 0; + my $nonmeth_bottom = 0; + + #warn "$1\n"; sleep(1); + # C on forward strand + my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + my $strand_top = '+'; + + # C on reverse strand + my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! + $tri_nt_bottom = reverse $tri_nt_bottom; + $tri_nt_bottom =~ tr/ACTG/TGAC/; + my $strand_bottom = '-'; + + next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand + next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the reverse strand + + # top strand + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! + $meth_top = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; + } + # bottom strand + if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs + $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; + $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; + } + + ### determining cytosine context top strand + if ($tri_nt_top =~ /^CG/){ + $context_top = 'CG'; + } + elsif ($tri_nt_top =~ /^C.{1}G$/){ + $context_top = 'CHG'; + } + elsif ($tri_nt_top =~ /^C.{2}$/){ + $context_top = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; + next; + } + + ### determining cytosine context bottom strand + if ($tri_nt_bottom =~ /^CG/){ + $context_bottom = 'CG'; + } + elsif ($tri_nt_bottom =~ /^C.{1}G$/){ + $context_bottom = 'CHG'; + } + elsif ($tri_nt_bottom =~ /^C.{2}$/){ + $context_bottom = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; + next; + } + + # if Cs were not covered at all they are not written out + unless ($meth_bottom + $nonmeth_bottom == 0){ + my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); + print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; + print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; + } + unless ($meth_top + $nonmeth_top == 0){ + my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); + print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; + print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; + } + + } + + %chr = (); # resetting the hash + + # new first entry + $last_chr = $chr; + $chr{$chr}->{$start}->{meth} = $meth; + $chr{$chr}->{$start}->{nonmeth} = $nonmeth; + } + } + + # Last found chromosome + warn "Writing cytosine GpC report for last chromosome $last_chr (stored ",scalar keys %{$chr{$last_chr}}," different covered positions)\n"; + $processed{$last_chr} = 1; + while ( $chromosomes{$last_chr} =~ /(GC)/g){ + + # a GC on the top strand automatically means that there is a GC on the bottom strand as well, so we can process both at the same time + my $context_top = ''; + my $context_bottom = ''; + my $pos = pos$chromosomes{$last_chr}; + + my $meth_top = 0; + my $meth_bottom = 0; + my $nonmeth_top = 0; + my $nonmeth_bottom = 0; + + # C on forward strand + my $tri_nt_top = substr ($chromosomes{$last_chr},($pos-1),3); # positions are 0-based! + my $strand_top = '+'; + + # C on reverse strand + my $tri_nt_bottom = substr ($chromosomes{$last_chr},($pos-4),3); # positions are 0-based! + $tri_nt_bottom = reverse $tri_nt_bottom; + $tri_nt_bottom =~ tr/ACTG/TGAC/; + my $strand_bottom = '-'; + + next if (length $tri_nt_top < 3); # trinucleotide sequence could not be extracted for the top strand + next if (length $tri_nt_bottom < 3); # trinucleotide sequence could not be extracted for the bottom strand + + ### determining cytosine context top strand + if ($tri_nt_top =~ /^CG/){ + $context_top = 'CG'; + } + elsif ($tri_nt_top =~ /^C.{1}G$/){ + $context_top = 'CHG'; + } + elsif ($tri_nt_top =~ /^C.{2}$/){ + $context_top = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the top strand (found: '$tri_nt_top'). Skipping.\n"; + next; + } + + ### determining cytosine context bottom strand + if ($tri_nt_bottom =~ /^CG/){ + $context_bottom = 'CG'; + } + elsif ($tri_nt_bottom =~ /^C.{1}G$/){ + $context_bottom = 'CHG'; + } + elsif ($tri_nt_bottom =~ /^C.{2}$/){ + $context_bottom = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed + warn "The sequence context could not be determined for the bottom strand (found: '$tri_nt_bottom'). Skipping.\n"; + next; + } + + # top strand + if (exists $chr{$last_chr}->{($pos)}){ # stored positions are 1-based! + $meth_top = $chr{$last_chr}->{$pos}->{meth}; + $nonmeth_top = $chr{$last_chr}->{$pos}->{nonmeth}; + } + # bottom strand + if (exists $chr{$last_chr}->{($pos-1)}){ # stored positions are 1-based! -1 for bottom strand Cs + $meth_bottom = $chr{$last_chr}->{$pos-1}->{meth}; + $nonmeth_bottom = $chr{$last_chr}->{$pos-1}->{nonmeth}; + } + + # if Cs were not covered at all they are not written out + unless ($meth_bottom + $nonmeth_bottom == 0){ + my $percentage = sprintf ("%.6f",$meth_bottom/ ($meth_bottom + $nonmeth_bottom) *100); + print GCCOV join ("\t",$last_chr,$pos-1,$pos-1,$percentage,$meth_bottom,$nonmeth_bottom),"\n"; + print GC join ("\t",$last_chr,$pos-1,$strand_bottom,$meth_bottom,$nonmeth_bottom,$context_bottom,$tri_nt_bottom),"\n"; + } + unless ($meth_top + $nonmeth_top == 0){ + my $percentage = sprintf ("%.6f",$meth_top/ ($meth_top + $nonmeth_top) *100); + print GCCOV join ("\t",$last_chr,$pos,$pos,$percentage,$meth_top,$nonmeth_top),"\n"; + print GC join ("\t",$last_chr,$pos,$strand_top,$meth_top,$nonmeth_top,$context_top,$tri_nt_top),"\n"; + } + } + + warn "Finished writing out GpC cytosine report for covered chromosomes (processed $number_processed chromosomes/scaffolds in total)\n\n"; + + close GC or warn $!; + +} + +#### +#### + + +sub process_unprocessed_chromosomes{ + + my $chr = shift; + + warn "Writing cytosine report for not covered chromosome $chr\n"; + $processed{$chr} = 1; + + if ($split_by_chromosome){ ## writing output to 1 file per chromosome + my $chromosome_out = $cytosine_out; + if ($chromosome_out =~ /txt$/){ # files passed on by the methylation extractor end in _report.txt + $chromosome_out =~ s/txt$/chr${chr}.txt/; + } + else{ # user specified output file name + $chromosome_out =~ s/$/.chr${chr}.txt/; + } + + open (CYT,'>',$chromosome_out) or die $!; + # warn "Writing output for $last_chr to $chromosome_out\n"; + } + + my $tri_nt; + my $tetra_nt; + my $penta_nt; + my $context; + + while ( $chromosomes{$chr} =~ /([CG])/g){ + + $tri_nt = ''; + $context = ''; + if ($tetra){ + $tetra_nt = ''; # clearing + $penta_nt = ''; + } + + my $pos = pos$chromosomes{$chr}; + + my $strand; + my $meth = 0; + my $nonmeth = 0; + + if ($1 eq 'C'){ # C on forward strand + $tri_nt = substr ($chromosomes{$chr},($pos-1),3); # positions are 0-based! + $strand = '+'; + if ($tetra){ + if ( length($chromosomes{$chr}) >= ($pos - 1 + 4) ){ + $tetra_nt = substr ($chromosomes{$chr},($pos-1),4); + } + else{ + $tetra_nt = ''; + } + if ( length($chromosomes{$chr}) >= ($pos - 1 + 5) ){ + $penta_nt = substr ($chromosomes{$chr},($pos-1),5); + } + else{ + $penta_nt = ''; + } + } + } + elsif ($1 eq 'G'){ # C on reverse strand + $tri_nt = substr ($chromosomes{$chr},($pos-3),3); # positions are 0-based! + $tri_nt = reverse $tri_nt; + $tri_nt =~ tr/ACTG/TGAC/; + $strand = '-'; + + if ($tetra){ + if ( $pos - 4 >= 0 ){ + $tetra_nt = substr ($chromosomes{$chr},($pos-4),4); + $tetra_nt = reverse $tetra_nt; + $tetra_nt =~ tr/ACTG/TGAC/; + } + else{ + $tetra_nt = ''; + } + if ( $pos - 5 >= 0 ){ + $penta_nt = substr ($chromosomes{$chr},($pos-5),5); + $penta_nt = reverse $penta_nt; + $penta_nt =~ tr/ACTG/TGAC/; + } + else{ + $penta_nt = ''; + } + } + $strand = '-'; + } + + next if (length$tri_nt < 3); # trinucleotide sequence could not be extracted + + ### determining cytosine context + if ($tri_nt =~ /^CG/){ + $context = 'CG'; + } + elsif ($tri_nt =~ /^C.{1}G$/){ + $context = 'CHG'; + } + elsif ($tri_nt =~ /^C.{2}$/){ + $context = 'CHH'; + } + else{ # if the context can't be determined the positions will not be printed (it will equally not have been reported by Bismark) + warn "The cytosine context could not be determined (found: '$tri_nt'). Skipping.\n"; + next; + } + + if ($CpG_only){ + if ($tri_nt =~ /^CG/){ # CpG context is the default + if ($zero){ # zero-based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + else{ ## all cytosines, specified with --CX + if ($zero){ # zero based coordinates + $pos -= 1; + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + else{ # default + if ($tetra){ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt,$tetra_nt,$penta_nt),"\n"; + } + else{ + print CYT join ("\t",$chr,$pos,$strand,$meth,$nonmeth,$context,$tri_nt),"\n"; + } + } + } + } + + # close CYT or warn $!; + +} + + +sub read_genome_into_memory{ + + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + # skipping the tophat entire mouse genome fasta file + next if ($chromosome_filename eq 'Mus_musculus.NCBIM37.fa'); + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + $first_line =~ s/\r//; # removing /r carriage returns + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (){ + chomp; + $_ =~ s/\r//; # removing /r carriage returns + + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + warn "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + warn "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $processed{$chromosome_name} = 0; # processed chromosomes will be set to 1 later to allow a record of which chromosome has been processed + } + } + warn "\n"; + chdir $parent_dir or die "Failed to move to directory $parent_dir\n"; +} + +sub extract_chromosome_name { + ## Bowtie extracts the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + + +sub print_helpfile{ + + warn < + + The coordinates of the input file are expected to be 1-based throughout (do not use files ending in .zero.cov!). + + + USAGE: coverage2cytosine [options] --genome_folder -o [input] + + +-o/--output Name of the output file, mandatory. + +--dir Output directory. Output is written to the current directory if not specified explicitly. + +--genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +-CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--merge_CpG Using this option will post-process the genome-wide report to write out an additional coverage + file (see above for the coverage file format) which has the top and bottom strand methylation + evidence pooled into a single CpG dinucleotide entity. This may be the desirable input format + for some downstream processing tools such as the R-package bsseq (by K.D. Hansen). An example would be: + + genome-wide CpG report (old) + gi|9626372|ref|NC_001422.1| 157 + 313 156 CG + gi|9626372|ref|NC_001422.1| 158 - 335 156 CG + merged CpG evidence coverage file (new) + gi|9626372|ref|NC_001422.1| 157 158 67.500000 648 312 + + This option is currently experimental, and only works if CpG context only and a single genome-wide report + were specified (i.e. it doesn't work with the options --CX or --split_by_chromosome). + +--gc/--gc_context In addition to normal processing this option will reprocess the genome to find methylation in + GpC context. This might be useful for specialist applications where GpC methylases had been + deployed. The output format is exactly the same as for the normal cytosine report, and only + positions covered by at least one read are reported (output file ends in .GpC_report.txt). In addition + this will write out a Bismark coverage file (ending in GpC.cov). + +--ff In addition to the standard output selecting --ff will also extract a four and five (tetra/penta) + nucleotide context for the cytosines in question. Too short sequences (e.g. at the edges of the + chromosome) will be left blank; sequences containing Ns are ignored. + +--zero_based Uses 0-based coordinates instead of 1-based coordinates throughout. Default: OFF. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + +--gzip Output file will be GZIP compressed (ending in .gz). Only works for standard CpG- and CX-output. + +--help Displays this help message and exits + + + +OUTPUT FORMAT: + +The genome-wide cytosine methylation output file is tab-delimited in the following format (1-based coords): +=========================================================================================================== + + + + + Script last modified: 04 April 2016 + +EOF + ; + exit 1; +} + diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methylation_extractor --- a/bismark_methylation_extractor Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4760 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; -$|++; -use Getopt::Long; -use Cwd; -use Carp; -use FindBin qw($Bin); -use lib "$Bin/../lib"; - - -## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) - -## This program is free software: you can redistribute it and/or modify -## it under the terms of the GNU General Public License as published by -## the Free Software Foundation, either version 3 of the License, or -## (at your option) any later version. - -## This program is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. - -## You should have received a copy of the GNU General Public License -## along with this program. If not, see . - -my @filenames; # input files -my %counting; -my $parent_dir = getcwd(); - -my %fhs; - -my $version = 'v0.10.1'; -my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline(); - - -### only needed for bedGraph output -my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files -my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total -my @bedfiles; - -### only needed for genome-wide cytosine methylation report -my %chromosomes; - -my %mbias_1; -my %mbias_2; - -############################################################################################## -### Summarising Run Parameters -############################################################################################## - -### METHYLATION EXTRACTOR - -warn "Summarising Bismark methylation extractor parameters:\n"; -warn '='x63,"\n"; - -if ($single){ - if ($vanilla){ - warn "Bismark single-end vanilla format specified\n"; - } - else{ - warn "Bismark single-end SAM format specified (default)\n"; # default - } -} -elsif ($paired){ - if ($vanilla){ - warn "Bismark paired-end vanilla format specified\n"; - } - else{ - warn "Bismark paired-end SAM format specified (default)\n"; # default - } -} - -if ($single){ - if ($ignore){ - warn "First $ignore bp will be disregarded when processing the methylation call string\n"; - } -} -else{ ## paired-end - if ($ignore){ - warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; - } - if ($ignore_r2){ - warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; - } -} - - -if ($full){ - warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; -} -if ($merge_non_CpG){ - warn "Merge CHG and CHH context to non-CpG context specified\n"; -} -### output directory -if ($output_dir eq ''){ - warn "Output will be written to the current directory ('$parent_dir')\n"; -} -else{ - warn "Output path specified as: $output_dir\n"; -} - - -sleep (1); - -### BEDGRAPH - -if ($bedGraph){ - warn "\n\nSummarising bedGraph parameters:\n"; - warn '='x63,"\n"; - - if ($counts){ - warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t \ncoverage format:\t \n\n"; - } - else{ - warn "Generating additional sorted output in bedGraph format (output format: )\n"; - } - - warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; - - if ($CX_context){ - warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; - } - else{ # default - $CpG_only = 1; - warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; - } - - if ($remove){ - warn "White spaces in read ID names will be removed prior to sorting\n"; - } - - if ($ample_mem){ - warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; - } - elsif (defined $sort_size){ - warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; - } - else{ - warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; - } - - - - sleep (1); - - if ($cytosine_report){ - warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; - warn '='x63,"\n"; - warn "Generating comprehensive genome-wide cytosine report\n(output format: )\n"; - - - if ($CX_context){ - warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; - } - else{ # default - $CpG_only = 1; - warn "Reporting cytosine methylation in CpG context only (default)\n"; - } - - if ($split_by_chromosome){ - warn "Splitting the cytosine report output up into individual files for each chromosome\n"; - } - - ### Zero-based coordinates - if ($zero){ - warn "Using zero-based genomic coordinates (user-defined)\n"; - } - else{ # default, 1-based coords - warn "Using 1-based genomic coordinates (default)\n"; - } - - ### GENOME folder - if ($genome_folder){ - unless ($genome_folder =~/\/$/){ - $genome_folder =~ s/$/\//; - } - warn "Genome folder was specified as $genome_folder\n"; - } - else{ - $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; - warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; - } - sleep (1); - } -} - -warn "\n"; -sleep (5); - -###################################################### -### PROCESSING FILES -###################################################### - -foreach my $filename (@filenames){ - # resetting counters and filehandles - %fhs = (); - %counting =( - total_meCHG_count => 0, - total_meCHH_count => 0, - total_meCpG_count => 0, - total_unmethylated_CHG_count => 0, - total_unmethylated_CHH_count => 0, - total_unmethylated_CpG_count => 0, - sequences_count => 0, - ); - - @sorting_files = (); - @bedfiles = (); - - %mbias_1 = (); - %mbias_2 = (); - - ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor - unless ($vanilla){ - if ($paired){ - test_positional_sorting($filename); - } - } - - process_Bismark_results_file($filename); - - ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues - foreach my $fh (keys %fhs) { - if ($fh =~ /^[1230]$/) { - foreach my $context (keys %{$fhs{$fh}}) { - close $fhs{$fh}->{$context} or die $!; - } - } - else{ - close $fhs{$fh} or die $!; - } - } - - ### printing out all M-Bias data - produce_mbias_plots ($filename); - - delete_unused_files(); - - if ($bedGraph){ - - my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified - $out =~ s/gz$//; - $out =~ s/sam$//; - $out =~ s/bam$//; - $out =~ s/txt$//; - $out =~ s/$/bedGraph/; - - my $bedGraph_output = $out; - my @args; - - if ($remove){ - push @args, '--remove'; - } - if ($CX_context){ - push @args, '--CX_context'; - } - if ($no_header){ - push @args, '--no_header'; - } - if ($gazillion){ - push @args, '--gazillion'; - } - if ($ample_mem){ - push @args, '--ample_memory'; - } - - - # if ($counts){ - # push @args, "--counts"; - # } - - push @args, "--buffer_size $sort_size"; - push @args, "--cutoff $coverage_threshold"; - push @args, "--output $bedGraph_output"; - push @args, "--dir '$output_dir'"; - - ### adding all files to be sorted to @args - foreach my $f (@sorting_files){ - push @args, $f; - } - - # print join "\t",@args,"\n"; - - system ("$Bin/bismark2bedGraph @args"); - - warn "Finished BedGraph conversion ...\n\n"; - sleep(3); - - # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; - # warn "Writing bedGraph to file: $bedGraph_output\n"; - # process_bedGraph_output(); - # close OUT or die $!; - - ### genome-wide cytosine methylation report requires bedGraph processing anyway - if ($cytosine_report){ - - @args = (); # resetting @args - my $cytosine_out = $out; - $cytosine_out =~ s/bedGraph$//; - - if ($CX_context){ - $cytosine_out =~ s/$/CX_report.txt/; - } - else{ - $cytosine_out =~ s/$/CpG_report.txt/; - } - - push @args, "--output $cytosine_out"; - push @args, "--dir '$output_dir'"; - push @args, "--genome '$genome_folder'"; - push @args, "--parent_dir '$parent_dir'"; - - if ($zero){ - push @args, "--zero"; - } - if ($CX_context){ - push @args, '--CX_context'; - } - if ($split_by_chromosome){ - push @args, '--split_by_chromosome'; - } - - my $coverage_output = $bedGraph_output; - $coverage_output =~ s/bedGraph$/bismark.cov/; - - push @args, $output_dir . $coverage_output; # this will be the infile - - system ("$Bin/coverage2cytosine @args"); - # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); - warn "\n\nFinished generating genome-wide cytosine report\n\n"; - } - } -} - -sub delete_unused_files{ - - warn "Deleting unused files ...\n\n"; sleep(1); - - my $index = 0; - - while ($index <= $#sorting_files){ - if ($sorting_files[$index] =~ /gz$/){ - open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; - } - else{ - open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; - } - - my $used = 0; - - while (){ - next if (/^Bismark/); - if ($_){ - $used = 1; - last; - } - } - - if ($used){ - warn "$sorting_files[$index] contains data ->\tkept\n"; - ++$index; - } - else{ - - my $delete = unlink $sorting_files[$index]; - - if ($delete){ - warn "$sorting_files[$index] was empty ->\tdeleted\n"; - } - else{ - warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" - } - - ### we also need to remove the element from @sorting_files - splice @sorting_files, $index, 1; - } - } - warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) -} - -sub produce_mbias_plots{ - - my $filename = shift; - - my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified - $mbias =~ s/gz$//; - $mbias =~ s/sam$//; - $mbias =~ s/bam$//; - $mbias =~ s/txt$//; - my $mbias_graph_1 = my $mbias_graph_2 = $mbias; - $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; - $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; - - $mbias =~ s/$/M-bias.txt/; - - open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; - - # determining maximum read length - my $max_length_1 = 0; - my $max_length_2 = 0; - - foreach my $context (keys %mbias_1){ - foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ - $max_length_1 = $pos unless ($max_length_1 >= $pos); - } - } - if ($paired){ - foreach my $context (keys %mbias_2){ - foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ - $max_length_2 = $pos unless ($max_length_2 >= $pos); - } - } - } - - if ($single){ - warn "Determining maximum read length for M-Bias plot\n"; - warn "Maximum read length of Read 1: $max_length_1\n\n"; - } - else{ - warn "Determining maximum read lengths for M-Bias plots\n"; - warn "Maximum read length of Read 1: $max_length_1\n"; - warn "Maximum read length of Read 2: $max_length_2\n\n"; - } - # sleep(3); - - my @mbias_read1; - my @mbias_read2; - - #Check whether the module GD::Graph:lines is installed - my $gd_graph_installed = 0; - eval{ - require GD::Graph::lines; - GD::Graph::lines->import(); - }; - - unless($@) { # syntax or routine error variable, set if something goes wron in the last eval{ require ...} - $gd_graph_installed = 1; - - #Check whether the module GD::Graph::colour is installed - eval{ - require GD::Graph::colour; - GD::Graph::colour->import(qw(:colours :lists :files :convert)); - }; - - if ($@) { - warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; - sleep(2); - $gd_graph_installed = 0; - } - - - } - else{ - warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; - sleep(2); - } - - - my $graph_title; - my $graph1; - my $graph2; - - if ( $gd_graph_installed){ - $graph1 = GD::Graph::lines->new(800,600); - if ($paired){ - $graph2 = GD::Graph::lines->new(800,600); - } - } - - foreach my $context (qw(CpG CHG CHH)){ - @{$mbias_read1[0]} = (); - - if ($paired){ - print MBIAS "$context context (R1)\n================\n"; - $graph_title = 'M-bias (Read 1)'; - } - else{ - print MBIAS "$context context\n===========\n"; - $graph_title = 'M-bias'; - } - print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; - - foreach my $pos (1..$max_length_1){ - - unless (defined $mbias_1{$context}->{$pos}->{meth}){ - $mbias_1{$context}->{$pos}->{meth} = 0; - } - unless (defined $mbias_1{$context}->{$pos}->{un}){ - $mbias_1{$context}->{$pos}->{un} = 0; - } - - my $percent = ''; - if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ - $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); - } - my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; - - print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; - push @{$mbias_read1[0]},$pos; - - if ($context eq 'CpG'){ - push @{$mbias_read1[1]},$percent; - push @{$mbias_read1[4]},$coverage; - } - elsif ($context eq 'CHG'){ - push @{$mbias_read1[2]},$percent; - push @{$mbias_read1[5]},$coverage; - } - elsif ($context eq 'CHH'){ - push @{$mbias_read1[3]},$percent; - push @{$mbias_read1[6]},$coverage; - } - } - print MBIAS "\n"; - } - - if ( $gd_graph_installed){ - - add_colour(nice_blue => [31,120,180]); - add_colour(nice_orange => [255,127,0]); - add_colour(nice_green => [51,160,44]); - add_colour(pale_blue => [153,206,227]); - add_colour(pale_orange => [253,204,138]); - add_colour(pale_green => [191,230,207]); - - $graph1->set( - x_label => 'position (bp)', - y1_label => '% methylation', - y2_label => '# methylation calls', - title => $graph_title, - line_width => 2, - x_max_value => $max_length_1, - x_min_value => 0, - y_tick_number => 10, - y_label_skip => 2, - y1_max_value => 100, - y1_min_value => 0, - y_label_skip => 2, - y2_min_value => 0, - x_label_skip => 5, - x_label_position => 0.5, - x_tick_offset => -1, - bgclr => 'white', - transparent => 0, - two_axes => 1, - use_axis => [1,1,1,2,2,2], - legend_placement => 'RC', - legend_spacing => 6, - legend_marker_width => 24, - legend_marker_height => 18, - dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], - ) or die $graph1->error; - - $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); - - my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; - - open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; - binmode MBIAS_G1; - print MBIAS_G1 $gd1->png; - } - - if ($paired){ - - foreach my $context (qw(CpG CHG CHH)){ - @{$mbias_read2[0]} = (); - - print MBIAS "$context context (R2)\n================\n"; - print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; - foreach my $pos (1..$max_length_2){ - - unless (defined $mbias_2{$context}->{$pos}->{meth}){ - $mbias_2{$context}->{$pos}->{meth} = 0; - } - unless (defined $mbias_2{$context}->{$pos}->{un}){ - $mbias_2{$context}->{$pos}->{un} = 0; - } - - my $percent = ''; - if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ - $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); - } - my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; - - print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; - - push @{$mbias_read2[0]},$pos; - - if ($context eq 'CpG'){ - push @{$mbias_read2[1]},$percent; - push @{$mbias_read2[4]},$coverage; - } - elsif ($context eq 'CHG'){ - push @{$mbias_read2[2]},$percent; - push @{$mbias_read2[5]},$coverage; - } - elsif ($context eq 'CHH'){ - push @{$mbias_read2[3]},$percent; - push @{$mbias_read2[6]},$coverage; - } - } - print MBIAS "\n"; - } - - if ( $gd_graph_installed){ - - add_colour(nice_blue => [31,120,180]); - add_colour(nice_orange => [255,127,0]); - add_colour(nice_green => [51,160,44]); - add_colour(pale_blue => [153,206,227]); - add_colour(pale_orange => [253,204,138]); - add_colour(pale_green => [191,230,207]); - - $graph2->set( - x_label => 'position (bp)', - line_width => 2, - x_max_value => $max_length_1, - x_min_value => 0, - y_tick_number => 10, - y_label_skip => 2, - y1_max_value => 100, - y1_min_value => 0, - y_label_skip => 2, - y2_min_value => 0, - x_label_skip => 5, - x_label_position => 0.5, - x_tick_offset => -1, - bgclr => 'white', - transparent => 0, - two_axes => 1, - use_axis => [1,1,1,2,2,2], - legend_placement => 'RC', - legend_spacing => 6, - legend_marker_width => 24, - legend_marker_height => 18, - dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], - x_label => 'position (bp)', - y1_label => '% methylation', - y2_label => '# calls', - title => 'M-bias (Read 2)', - ) or die $graph2->error; - - $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); - my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; - - open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; - binmode MBIAS_G2; - print MBIAS_G2 $gd2->png; - - } - } -} - -sub process_commandline{ - my $help; - my $single_end; - my $paired_end; - my $ignore; - my $ignore_r2; - my $genomic_fasta; - my $full; - my $report; - my $extractor_version; - my $no_overlap; - my $merge_non_CpG; - my $vanilla; - my $output_dir; - my $no_header; - my $bedGraph; - my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status - my $remove; - my $counts; - my $cytosine_report; - my $genome_folder; - my $zero; - my $CpG_only; - my $CX_context; - my $split_by_chromosome; - my $sort_size; - my $samtools_path; - my $gzip; - my $mbias_only; - my $gazillion; - my $ample_mem; - - my $command_line = GetOptions ('help|man' => \$help, - 'p|paired-end' => \$paired_end, - 's|single-end' => \$single_end, - 'fasta' => \$genomic_fasta, - 'ignore=i' => \$ignore, - 'ignore_r2=i' => \$ignore_r2, - 'comprehensive' => \$full, - 'report' => \$report, - 'version' => \$extractor_version, - 'no_overlap' => \$no_overlap, - 'merge_non_CpG' => \$merge_non_CpG, - 'vanilla' => \$vanilla, - 'o|output=s' => \$output_dir, - 'no_header' => \$no_header, - 'bedGraph' => \$bedGraph, - "cutoff=i" => \$coverage_threshold, - "remove_spaces" => \$remove, - "counts" => \$counts, - "cytosine_report" => \$cytosine_report, - 'g|genome_folder=s' => \$genome_folder, - "zero_based" => \$zero, - "CX|CX_context" => \$CX_context, - "split_by_chromosome" => \$split_by_chromosome, - "buffer_size=s" => \$sort_size, - 'samtools_path=s' => \$samtools_path, - "gzip" => \$gzip, - "mbias_only" => \$mbias_only, - "gazillion|scaffolds" => \$gazillion, - "ample_memory" => \$ample_mem, - ); - - ### EXIT ON ERROR if there were errors with any of the supplied options - unless ($command_line){ - die "Please respecify command line options\n"; - } - - ### HELPFILE - if ($help){ - print_helpfile(); - exit; - } - - if ($extractor_version){ - print << "VERSION"; - - - Bismark Methylation Extractor - - Bismark Extractor Version: $version - Copyright 2010-13 Felix Krueger, Babraham Bioinformatics - www.bioinformatics.babraham.ac.uk/projects/bismark/ - - -VERSION - exit; - } - - - ### no files provided - unless (@ARGV){ - die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; - } - @filenames = @ARGV; - - warn "\n *** Bismark methylation extractor version $version ***\n\n"; - - ### M-BIAS ONLY - if ($mbias_only){ - if ($bedGraph){ - die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; - } - if ($cytosine_report){ - die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; - } - if ($merge_non_CpG){ - warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; - } - if ($full){ - warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; - } - sleep(3); - } - - ### PRINT A REPORT - unless ($report){ - $report = 0; - } - - ### OUTPUT DIR PATH - if ($output_dir){ - unless ($output_dir =~ /\/$/){ - $output_dir =~ s/$/\//; - } - } - else{ - $output_dir = ''; - } - - ### NO HEADER - unless ($no_header){ - $no_header = 0; - } - - ### OLD (VANILLA) OUTPUT FORMAT - unless ($vanilla){ - $vanilla = 0; - } - - if ($single_end){ - $paired_end = 0; ### SINGLE END ALIGNMENTS - } - elsif ($paired_end){ - $single_end = 0; ### PAIRED-END ALIGNMENTS - } - else{ - - ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header - - if ($vanilla){ - die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; - } - else{ # SAM/BAM format - - my $file = $filenames[0]; - warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); - - ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file - if ($file =~ /\.gz$/){ - open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; - } - elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){ - open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; - } - else{ - open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; - } - - while (){ - last unless (/^\@/); - if ($_ =~ /^\@PG/){ - # warn "found the \@PG line:\n"; - # warn "$_"; - - if ($_ =~ /-1/ and $_ =~ /-2/){ - warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); - $paired_end = 1; - $single_end = 0; - } - else{ - warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); - $paired_end = 0; - $single_end = 1; - } - } - } - - close DETERMINE or warn $!; - - } - } - - ### IGNORING bases at the start of the read when processing the methylation call string - unless ($ignore){ - $ignore = 0; - } - - if (defined $ignore_r2){ - die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); - } - else{ - $ignore_r2 = 0; - } - - - ### NO OVERLAP - if ($no_overlap){ - die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); - } - else{ - $no_overlap = 0; - } - - ### COMPREHENSIVE OUTPUT - unless ($full){ - $full = 0; - } - - ### MERGE NON-CpG context - unless ($merge_non_CpG){ - $merge_non_CpG = 0; - } - - ### remove white spaces in read ID (needed for sorting using the sort command - unless ($remove){ - $remove = 0; - } - - ### COVERAGE THRESHOLD FOR bedGraph OUTPUT - if (defined $coverage_threshold){ - unless ($coverage_threshold > 0){ - die "Please select a coverage greater than 0 (positive integers only)\n"; - } - } - else{ - $coverage_threshold = 1; - } - - ### SORT buffer size - if (defined $sort_size){ - unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ - die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; - } - } - else{ - $sort_size = '2G'; - } - - if ($zero){ - die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); - } - - if ($CX_context){ - die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); - } - else{ - $CX_context = 0; - } - - unless ($counts){ - $counts = 1; # counts will always be set - } - - if ($cytosine_report){ - - ### GENOME folder - if ($genome_folder){ - unless ($genome_folder =~/\/$/){ - $genome_folder =~ s/$/\//; - } - } - else{ - die "Please specify a genome folder to proceed (full path only)\n"; - } - - unless ($bedGraph){ - warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; - $bedGraph = 1; - } - unless ($counts){ - # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; - $counts = 1; - } - warn "\n"; - } - - ### PATH TO SAMTOOLS - if (defined $samtools_path){ - # if Samtools was specified as full command - if ($samtools_path =~ /samtools$/){ - if (-e $samtools_path){ - # Samtools executable found - } - else{ - die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; - } - } - else{ - unless ($samtools_path =~ /\/$/){ - $samtools_path =~ s/$/\//; - } - $samtools_path .= 'samtools'; - if (-e $samtools_path){ - # Samtools executable found - } - else{ - die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; - } - } - } - # Check whether Samtools is in the PATH if no path was supplied by the user - else{ - if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH - $samtools_path = `which samtools`; - chomp $samtools_path; - } - } - - unless (defined $samtools_path){ - $samtools_path = ''; - } - - - if ($gazillion){ - if ($ample_mem){ - die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; - } - } - - return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem); -} - - -sub test_positional_sorting{ - - my $filename = shift; - - print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; - sleep(1); - - if ($filename =~ /\.gz$/) { - open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; - } - elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { - if ($samtools_path){ - open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; - } - else{ - die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; - } - } - else { - open (TEST,$filename) or die "Can't open file $filename: $!\n"; - } - - my $count = 0; - - while () { - if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) - if (/^\@SO/) { - die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; - } - next; - } - $count++; - - last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID - - my ($id_1) = (split (/\t/)); - - ### reading the next line which should be read 2 - $_ = ; - my ($id_2) = (split (/\t/)); - last unless ($id_2); - ++$count; - - if ($id_1 eq $id_2){ - ### ids are the same - next; - } - else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first - my $id_1_trunc = $id_1; - $id_1_trunc =~ s/\/1$//; - my $id_2_trunc = $id_2; - $id_2_trunc =~ s/\/2$//; - - unless ($id_1_trunc eq $id_2_trunc){ - die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; - } - } - } - # close TEST or die $!; somehow fails on our cluster... - ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) - warn "...passed!\n"; - sleep(1); - -} - - -sub process_Bismark_results_file{ - my $filename = shift; - - warn "\nNow reading in Bismark result file $filename\n\n"; - - if ($filename =~ /\.gz$/) { - open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; - } - elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { - if ($samtools_path){ - open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; - } - else{ - die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; - } - } - else { - open (IN,$filename) or die "Can't open file $filename: $!\n"; - } - - ### Vanilla and SAM output need to read different numbers of header lines - if ($vanilla) { - my $bismark_version = ; ## discarding the Bismark version info - chomp $bismark_version; - $bismark_version =~ s/\r//; # replaces \r line feed - $bismark_version =~ s/Bismark version: //; - if ($bismark_version =~ /^\@/) { - warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; - sleep (2); - } - - unless ($version eq $bismark_version){ - die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; - } - } else { - # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. - # We are reading from it further down - } - - my $output_filename = (split (/\//,$filename))[-1]; - - ### OPENING OUTPUT-FILEHANDLES - if ($report) { - my $report_filename = $output_filename; - $report_filename =~ s/\.sam$//; - $report_filename =~ s/\.txt$//; - $report_filename =~ s/$/_splitting_report.txt/; - $report_filename = $output_dir . $report_filename; - open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; - } - - if ($report) { - print REPORT "$output_filename\n\n"; - print REPORT "Parameters used to extract methylation information:\n"; - if ($paired) { - if ($vanilla) { - print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; - } else { - print REPORT "Bismark result file: paired-end (SAM format)\n"; # default - } - } - - if ($single) { - if ($vanilla) { - print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; - } else { - print REPORT "Bismark result file: single-end (SAM format)\n"; # default - } - } - if ($single){ - if ($ignore) { - print REPORT "Ignoring first $ignore bp\n"; - } - } - else{ # paired-end - if ($ignore) { - print REPORT "Ignoring first $ignore bp of Read 1\n"; - } - if ($ignore_r2){ - print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; - } - } - - if ($full) { - print REPORT "Output specified: comprehensive\n"; - } else { - print REPORT "Output specified: strand-specific (default)\n"; - } - - if ($no_overlap) { - print REPORT "No overlapping methylation calls specified\n"; - } - if ($genomic_fasta) { - print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; - } - if ($merge_non_CpG) { - print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; - } - - print REPORT "\n"; - } - -##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; - - ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL - ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file - if ($full and $merge_non_CpG) { - my $cpg_output = my $other_c_output = $output_filename; - ### C in CpG context - $cpg_output =~ s/^/CpG_context_/; - $cpg_output =~ s/sam$/txt/; - $cpg_output =~ s/bam$/txt/; - $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); - $cpg_output = $output_dir . $cpg_output; - - if ($gzip){ - $cpg_output .= '.gz'; - open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); - } - else{ - open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); - push @sorting_files,$cpg_output; - - unless ($no_header) { - print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### C in any other context than CpG - $other_c_output =~ s/^/Non_CpG_context_/; - $other_c_output =~ s/sam$/txt/; - $other_c_output =~ s/bam$/txt/; - $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); - $other_c_output = $output_dir . $other_c_output; - - if ($gzip){ - $other_c_output .= '.gz'; - open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); - } - else{ - open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); - push @sorting_files,$other_c_output; - - - unless ($no_header) { - print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - } - - ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found - elsif ($merge_non_CpG) { - - my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; - - ### For cytosines in CpG context - $cpg_ot =~ s/^/CpG_OT_/; - $cpg_ot =~ s/sam$/txt/; - $cpg_ot =~ s/bam$/txt/; - $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); - $cpg_ot = $output_dir . $cpg_ot; - - if ($gzip){ - $cpg_ot .= '.gz'; - open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); - } - else{ - open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); - push @sorting_files,$cpg_ot; - - unless($no_header){ - print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ctot =~ s/^/CpG_CTOT_/; - $cpg_ctot =~ s/sam$/txt/; - $cpg_ctot =~ s/bam$/txt/; - $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); - $cpg_ctot = $output_dir . $cpg_ctot; - - if ($gzip){ - $cpg_ctot .= '.gz'; - open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); - } - else{ - open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); - push @sorting_files,$cpg_ctot; - - unless($no_header){ - print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ctob =~ s/^/CpG_CTOB_/; - $cpg_ctob =~ s/sam$/txt/; - $cpg_ctob =~ s/bam$/txt/; - $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); - $cpg_ctob = $output_dir . $cpg_ctob; - - if ($gzip){ - $cpg_ctob .= '.gz'; - open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); - } - else{ - open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); - push @sorting_files,$cpg_ctob; - - unless($no_header){ - print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ob =~ s/^/CpG_OB_/; - $cpg_ob =~ s/sam$/txt/; - $cpg_ob =~ s/bam$/txt/; - $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); - $cpg_ob = $output_dir . $cpg_ob; - - if ($gzip){ - $cpg_ob .= '.gz'; - open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); - } - else{ - open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); - push @sorting_files,$cpg_ob; - - unless($no_header){ - print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### For cytosines in Non-CpG (CC, CT or CA) context - my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; - - $other_c_ot =~ s/^/Non_CpG_OT_/; - $other_c_ot =~ s/sam$/txt/; - $other_c_ot =~ s/bam$/txt/; - $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); - $other_c_ot = $output_dir . $other_c_ot; - - if ($gzip){ - $other_c_ot .= '.gz'; - open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); - } - else{ - open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); - push @sorting_files,$other_c_ot; - - unless($no_header){ - print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $other_c_ctot =~ s/^/Non_CpG_CTOT_/; - $other_c_ctot =~ s/sam$/txt/; - $other_c_ctot =~ s/bam$/txt/; - $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); - $other_c_ctot = $output_dir . $other_c_ctot; - - if ($gzip){ - $other_c_ctot .= '.gz'; - open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); - } - else{ - open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); - push @sorting_files,$other_c_ctot; - - unless($no_header){ - print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $other_c_ctob =~ s/^/Non_CpG_CTOB_/; - $other_c_ctob =~ s/sam$/txt/; - $other_c_ctob =~ s/bam$/txt/; - $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); - $other_c_ctob = $output_dir . $other_c_ctob; - - if ($gzip){ - $other_c_ctob .= '.gz'; - open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); - } - else{ - open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); - push @sorting_files,$other_c_ctob; - - unless($no_header){ - print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $other_c_ob =~ s/^/Non_CpG_OB_/; - $other_c_ob =~ s/sam$/txt/; - $other_c_ob =~ s/sam$/txt/; - $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); - $other_c_ob = $output_dir . $other_c_ob; - - if ($gzip){ - $other_c_ob .= '.gz'; - open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); - } - else{ - open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); - push @sorting_files,$other_c_ob; - - unless($no_header){ - print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - } - ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) - - ### if --comprehensive was specified we are only writing one file per context - elsif ($full) { - my $cpg_output = my $chg_output = my $chh_output = $output_filename; - ### C in CpG context - $cpg_output =~ s/^/CpG_context_/; - $cpg_output =~ s/sam$/txt/; - $cpg_output =~ s/bam$/txt/; - $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); - $cpg_output = $output_dir . $cpg_output; - - if ($gzip){ - $cpg_output .= '.gz'; - open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); - } - else{ - open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); - push @sorting_files,$cpg_output; - - unless($no_header){ - print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### C in CHG context - $chg_output =~ s/^/CHG_context_/; - $chg_output =~ s/sam$/txt/; - $chg_output =~ s/bam$/txt/; - $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); - $chg_output = $output_dir . $chg_output; - - if ($gzip){ - $chg_output .= '.gz'; - open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); - } - else{ - open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); - push @sorting_files,$chg_output; - - unless($no_header){ - print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### C in CHH context - $chh_output =~ s/^/CHH_context_/; - $chh_output =~ s/sam$/txt/; - $chh_output =~ s/bam$/txt/; - $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); - $chh_output = $output_dir . $chh_output; - - if ($gzip){ - $chh_output .= '.gz'; - open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); - } - else{ - open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); - push @sorting_files, $chh_output; - - unless($no_header){ - print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - } - ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found - else { - my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; - - ### For cytosines in CpG context - $cpg_ot =~ s/^/CpG_OT_/; - $cpg_ot =~ s/sam$/txt/; - $cpg_ot =~ s/bam$/txt/; - $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); - $cpg_ot = $output_dir . $cpg_ot; - - if ($gzip){ - $cpg_ot .= '.gz'; - open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); - } - else{ - open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); - push @sorting_files,$cpg_ot; - - unless($no_header){ - print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ctot =~ s/^/CpG_CTOT_/; - $cpg_ctot =~ s/sam$/txt/; - $cpg_ctot =~ s/bam$/txt/; - $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); - $cpg_ctot = $output_dir . $cpg_ctot; - - if ($gzip){ - $cpg_ctot .= '.gz'; - open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); - } - else{ - open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); - push @sorting_files,$cpg_ctot; - - unless($no_header){ - print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ctob =~ s/^/CpG_CTOB_/; - $cpg_ctob =~ s/sam$/txt/; - $cpg_ctob =~ s/bam$/txt/; - $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); - $cpg_ctob = $output_dir . $cpg_ctob; - - if ($gzip){ - $cpg_ctob .= '.gz'; - open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); - } - else{ - open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); - push @sorting_files,$cpg_ctob; - - unless($no_header){ - print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $cpg_ob =~ s/^/CpG_OB_/; - $cpg_ob =~ s/sam$/txt/; - $cpg_ob =~ s/bam$/txt/; - $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); - $cpg_ob = $output_dir . $cpg_ob; - - if ($gzip){ - $cpg_ob .= '.gz'; - open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); - } - else{ - open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); - push @sorting_files,$cpg_ob; - - unless($no_header){ - print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### For cytosines in CHG context - my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; - - $chg_ot =~ s/^/CHG_OT_/; - $chg_ot =~ s/sam$/txt/; - $chg_ot =~ s/bam$/txt/; - $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); - $chg_ot = $output_dir . $chg_ot; - - if ($gzip){ - $chg_ot .= '.gz'; - open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); - } - else{ - open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); - push @sorting_files,$chg_ot; - - unless($no_header){ - print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chg_ctot =~ s/^/CHG_CTOT_/; - $chg_ctot =~ s/sam$/txt/; - $chg_ctot =~ s/bam$/txt/; - $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); - $chg_ctot = $output_dir . $chg_ctot; - - if ($gzip){ - $chg_ctot .= '.gz'; - open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); - } - else{ - open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); - push @sorting_files,$chg_ctot; - - unless($no_header){ - print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chg_ctob =~ s/^/CHG_CTOB_/; - $chg_ctob =~ s/sam$/txt/; - $chg_ctob =~ s/bam$/txt/; - $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); - $chg_ctob = $output_dir . $chg_ctob; - - if ($gzip){ - $chg_ctob .= '.gz'; - open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); - } - else{ - open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); - push @sorting_files,$chg_ctob; - - unless($no_header){ - print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chg_ob =~ s/^/CHG_OB_/; - $chg_ob =~ s/sam$/txt/; - $chg_ob =~ s/bam$/txt/; - $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); - $chg_ob = $output_dir . $chg_ob; - - if ($gzip){ - $chg_ob .= '.gz'; - open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); - } - else{ - open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); - push @sorting_files,$chg_ob; - - unless($no_header){ - print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - ### For cytosines in CHH context - my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; - - $chh_ot =~ s/^/CHH_OT_/; - $chh_ot =~ s/sam$/txt/; - $chh_ot =~ s/bam$/txt/; - $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); - $chh_ot = $output_dir . $chh_ot; - - if ($gzip){ - $chh_ot .= '.gz'; - open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); - } - else{ - open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); - push @sorting_files,$chh_ot; - - unless($no_header){ - print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chh_ctot =~ s/^/CHH_CTOT_/; - $chh_ctot =~ s/sam$/txt/; - $chh_ctot =~ s/bam$/txt/; - $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); - $chh_ctot = $output_dir . $chh_ctot; - - if ($gzip){ - $chh_ctot .= '.gz'; - open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); - } - else{ - open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); - push @sorting_files,$chh_ctot; - - unless($no_header){ - print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chh_ctob =~ s/^/CHH_CTOB_/; - $chh_ctob =~ s/sam$/txt/; - $chh_ctob =~ s/bam$/txt/; - $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); - $chh_ctob = $output_dir . $chh_ctob; - - if ($gzip){ - $chh_ctob .= '.gz'; - open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); - } - else{ - open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); - push @sorting_files,$chh_ctob; - - unless($no_header){ - print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - - $chh_ob =~ s/^/CHH_OB_/; - $chh_ob =~ s/sam$/txt/; - $chh_ob =~ s/bam$/txt/; - $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); - $chh_ob = $output_dir . $chh_ob; - - if ($gzip){ - $chh_ob .= '.gz'; - open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); - } - else{ - open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); - } - - warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); - push @sorting_files,$chh_ob; - - unless($no_header){ - print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); - } - } - - my $methylation_call_strings_processed = 0; - my $line_count = 0; - - ### proceeding differently now for single-end or paired-end Bismark files - - ### PROCESSING SINGLE-END RESULT FILES - if ($single) { - - ### also proceeding differently now for SAM format or vanilla Bismark format files - if ($vanilla) { # old vanilla Bismark output format - while () { - ++$line_count; - warn "Processed lines: $line_count\n" if ($line_count%500000==0); - - ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) - my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; - - ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or - ### last position - chomp $genome_conversion; - - my $index; - if ($meth_call) { - - if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand - $index = 0; - } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand - $index = 1; - } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand - $index = 3; - } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand - $index = 2; - } else { - die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; - } - - ### Clipping off the first number of bases from the methylation call string as specified with --ignore - if ($ignore) { - $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); - - ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! - if ($strand eq '+') { - $start += $ignore; - } elsif ($strand eq '-') { - $start += length($meth_call)-1; ## $meth_call is already shortened! - } else { - die "Alignment did not have proper strand information: $strand\n"; - } - } - ### printing out the methylation state of every C in the read - print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); - - ++$methylation_call_strings_processed; # 1 per single-end result - } - } - } else { # processing single-end SAM format (default) - while () { - ### SAM format can either start with header lines (starting with @) or start with alignments directly - if (/^\@/) { # skipping header lines (starting with @) - warn "skipping SAM header line:\t$_"; - next; - } - - ++$line_count; - warn "Processed lines: $line_count\n" if ($line_count%500000==0); - - # example read in SAM format - # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT - ### - - # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; - # < 0.7.6 $meth_call =~ s/^XM:Z://; - # < 0.7.6 $read_conversion =~ s/^XR:Z://; - # < 0.7.6 $genome_conversion =~ s/^XG:Z://; - - my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; - - ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression - my $meth_call; ### Thanks to Zachary Zeno for this solution - my $read_conversion; - my $genome_conversion; - - while ( /(XM|XR|XG):Z:([^\t]+)/g ) { - my $tag = $1; - my $value = $2; - - if ($tag eq "XM") { - $meth_call = $value; - $meth_call =~ s/\r//; - } elsif ($tag eq "XR") { - $read_conversion = $value; - $read_conversion =~ s/\r//; - } elsif ($tag eq "XG") { - $genome_conversion = $value; - $genome_conversion =~ s/\r//; - } - } - - my $strand; - chomp $genome_conversion; - # print "$meth_call\n$read_conversion\n$genome_conversion\n"; - - my $index; - if ($meth_call) { - if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand - $index = 0; - $strand = '+'; - } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand - $index = 1; - $strand = '-'; - } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand - $index = 2; - $strand = '+'; - } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand - $index = 3; - $strand = '-'; - } else { - die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; - } - - ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output - if ($strand eq '-') { - $meth_call = reverse $meth_call; - } - - ### Clipping off the first number of bases from the methylation call string as specified with --ignore - if ($ignore) { - # print "\n\n$meth_call\n"; - $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); - # print "$meth_call\n"; - - ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly - - my @len = split (/\D+/,$cigar); # storing the length per operation - my @ops = split (/\d+/,$cigar); # storing the operation - shift @ops; # remove the empty first element - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - my @comp_cigar; # building an array with all CIGAR operations - foreach my $index (0..$#len) { - foreach (1..$len[$index]) { - # print "$ops[$index]"; - push @comp_cigar, $ops[$index]; - } - } - # print "original CIGAR: $cigar\n"; - # print "original CIGAR: @comp_cigar\n"; - - ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! - if ($strand eq '+') { - - my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions - my $I_count = 0; - - for (1..$ignore) { - my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start - # print "$_ deleted $op\n"; - - while ($op eq 'D') { # repeating this for deletions (D) - $D_count++; - $op = shift @comp_cigar; - # print "$_ deleted $op\n"; - } - if ($op eq 'I') { # adjusting the genomic position for insertions (I) - $I_count++; - } - } - $start += $ignore + $D_count - $I_count; - # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; - } elsif ($strand eq '-') { - - for (1..$ignore) { - my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array - while ($op eq 'D') { # repeating this for deletions (D) - $op = pop @comp_cigar; - } - } - - ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR - ### string to be able to work out the starting position of the read which is on the 3' end of the sequence - my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position - foreach (@comp_cigar) { - ++$MD_count if ($_ eq 'M' or $_ eq 'D'); - } - $start += $MD_count - 1; - } - - ### reconstituting shortened CIGAR string - my $new_cigar; - my $count = 0; - my $last_op; - # print "ignore adjusted: @comp_cigar\n"; - foreach my $op (@comp_cigar) { - unless (defined $last_op){ - $last_op = $op; - ++$count; - next; - } - if ($last_op eq $op) { - ++$count; - } else { - $new_cigar .= "$count$last_op"; - $last_op = $op; - $count = 1; - } - } - $new_cigar .= "$count$last_op"; # appending the last operation and count - $cigar = $new_cigar; - # print "ignore adjusted scalar: $cigar\n"; - } - } - ### printing out the methylation state of every C in the read - print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); - - ++$methylation_call_strings_processed; # 1 per single-end result - } - } - } - - ### PROCESSING PAIRED-END RESULT FILES - elsif ($paired) { - - ### proceeding differently now for SAM format or vanilla Bismark format files - if ($vanilla) { # old vanilla Bismark paired-end output format - while () { - ++$line_count; - warn "processed line: $line_count\n" if ($line_count%500000==0); - - ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) - my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; - - my $index; - chomp $genome_conversion; - - if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { - $index = 0; ## this is OT - } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { - $index = 2; ## this is CTOB!!! - } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { - $index = 1; ## this is CTOT!!! - } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { - $index = 3; ## this is OB - } else { - die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; - } - - if ($meth_call_1 and $meth_call_2) { - ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' - - if ($ignore) { - $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); - - ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified - $start_read_1 += $ignore; - } - if ($ignore_r2) { - $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); - - ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified - $end_read_2 -= $ignore_r2; - } - - my $end_read_1; - my $start_read_2; - - if ($strand eq '+') { - - $end_read_1 = $start_read_1+length($meth_call_1)-1; - $start_read_2 = $end_read_2-length($meth_call_2)+1; - - ## we first pass the first read which is in + orientation on the forward strand - print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity - - # we next pass the second read which is in - orientation on the reverse strand - ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 - print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); - } - else { - - $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! - $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! - - ## we first pass the first read which is in - orientation on the reverse strand - print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); - - # we next pass the second read which is in + orientation on the forward strand - ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 - print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); - } - - $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls - } - } - } - else { # Bismark paired-end SAM output format (default) - while () { - ### SAM format can either start with header lines (starting with @) or start with alignments directly - if (/^\@/) { # skipping header lines (starting with @) - warn "skipping SAM header line:\t$_"; - next; - } - - ++$line_count; - warn "Processed lines: $line_count\n" if ($line_count%500000==0); - - # example paired-end reads in SAM format (2 consecutive lines) - # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT - # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT - - # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; - - my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression - my $meth_call_1; - my $first_read_conversion; - my $genome_conversion; - - while ( /(XM|XR|XG):Z:([^\t]+)/g ) { - my $tag = $1; - my $value = $2; - - if ($tag eq "XM") { - $meth_call_1 = $value; - $meth_call_1 =~ s/\r//; - } elsif ($tag eq "XR") { - $first_read_conversion = $value; - $first_read_conversion =~ s/\r//; - } elsif ($tag eq "XG") { - $genome_conversion = $value; - $genome_conversion =~ s/\r//; - } - } - - $_ = ; # reading in the paired read - - # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; - # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; - # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; - # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; - # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; - - my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression - - my $meth_call_2; - my $second_read_conversion; - - while ( /(XM|XR):Z:([^\t]+)/g ) { - my $tag = $1; - my $value = $2; - - if ($tag eq "XM") { - $meth_call_2 = $value; - $meth_call_2 =~ s/\r//; - } elsif ($tag eq "XR") { - $second_read_conversion = $value; - $second_read_conversion = s/\r//; - } - } - - # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; - chomp $genome_conversion; # in case it captured a new line character - - # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; - - my $index; - my $strand; - - if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { - $index = 0; ## this is OT - $strand = '+'; - } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { - $index = 1; ## this is CTOT - $strand = '-'; - } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { - $index = 2; ## this is CTOB - $strand = '+'; - } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { - $index = 3; ## this is OB - $strand = '-'; - } else { - die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; - } - - ### reversing the methylation call of the read that was reverse-complemented - if ($strand eq '+') { - $meth_call_2 = reverse $meth_call_2; - } else { - $meth_call_1 = reverse $meth_call_1; - } - - if ($meth_call_1 and $meth_call_2) { - - my $end_read_1; - - ### READ 1 - my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation - my @ops_1 = split (/\d+/,$cigar_1); # storing the operation - shift @ops_1; # remove the empty first element - - die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); - - my @comp_cigar_1; # building an array with all CIGAR operations - foreach my $index (0..$#len_1) { - foreach (1..$len_1[$index]) { - # print "$ops_1[$index]"; - push @comp_cigar_1, $ops_1[$index]; - } - } - # print "original CIGAR read 1: $cigar_1\n"; - # print "original CIGAR read 1: @comp_cigar_1\n"; - - ### READ 2 - my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation - my @ops_2 = split (/\d+/,$cigar_2); # storing the operation - shift @ops_2; # remove the empty first element - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); - my @comp_cigar_2; # building an array with all CIGAR operations for read 2 - foreach my $index (0..$#len_2) { - foreach (1..$len_2[$index]) { - # print "$ops_2[$index]"; - push @comp_cigar_2, $ops_2[$index]; - } - } - # print "original CIGAR read 2: $cigar_2\n"; - # print "original CIGAR read 2: @comp_cigar_2\n"; - - - - if ($ignore) { - ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' for read 1 - ### the methylation calls have already been reversed where necessary - $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); - - if ($strand eq '+') { - - ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start - my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions - my $I_count_1 = 0; - - for (1..$ignore) { - my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start - # print "$_ deleted $op\n"; - - while ($op eq 'D') { # repeating this for deletions (D) - $D_count_1++; - $op = shift @comp_cigar_1; - # print "$_ deleted $op\n"; - } - if ($op eq 'I') { # adjusting the genomic position for insertions (I) - $I_count_1++; - } - } - - $start_read_1 += $ignore + $D_count_1 - $I_count_1; - # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; - - # the start position of reads mapping to the reverse strand is being adjusted further below - } - elsif ($strand eq '-') { - - ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back - for (1..$ignore) { - my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array - while ($op eq 'D') { # repeating this for deletions (D) - $op = pop @comp_cigar_1; - } - } - # the start position of reads mapping to the reverse strand is being adjusted further below - - } - } - - if ($ignore_r2) { - ### Clipping off the first number of bases from the methylation call string as specified with '--ignore_r2 ' for read 2 - ### the methylation calls have already been reversed where necessary - $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); - - ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly - - if ($strand eq '+') { - - ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back - - for (1..$ignore_r2) { - my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array - while ($op eq 'D') { # repeating this for deletions (D) - $op = pop @comp_cigar_2; - } - } - # the start position of reads mapping to the reverse strand is being adjusted further below - } - elsif ($strand eq '-') { - - ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start - my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions - my $I_count_2 = 0; - - for (1..$ignore_r2) { - my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start - # print "$_ deleted $op\n"; - - while ($op eq 'D') { # repeating this for deletions (D) - $D_count_2++; - $op = shift @comp_cigar_2; - # print "$_ deleted $op\n"; - } - if ($op eq 'I') { # adjusting the genomic position for insertions (I) - $I_count_2++; - } - } - - $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; - # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; - } - } - - if ($ignore){ - ### reconstituting shortened CIGAR string 1 - my $new_cigar_1; - my $count_1 = 0; - my $last_op_1; - # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; - foreach my $op (@comp_cigar_1) { - unless (defined $last_op_1){ - $last_op_1 = $op; - ++$count_1; - next; - } - if ($last_op_1 eq $op) { - ++$count_1; - } else { - $new_cigar_1 .= "$count_1$last_op_1"; - $last_op_1 = $op; - $count_1 = 1; - } - } - $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count - $cigar_1 = $new_cigar_1; - # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; - } - - if ($ignore_r2){ - - ### reconstituting shortened CIGAR string 2 - my $new_cigar_2; - my $count_2 = 0; - my $last_op_2; - # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; - foreach my $op (@comp_cigar_2) { - unless (defined $last_op_2){ - $last_op_2 = $op; - ++$count_2; - next; - } - if ($last_op_2 eq $op) { - ++$count_2; - } - else { - $new_cigar_2 .= "$count_2$last_op_2"; - $last_op_2 = $op; - $count_2 = 1; - } - } - $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count - $cigar_2 = $new_cigar_2; - # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; - } - - ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on - - if ($strand eq '+') { - ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 - @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too - # print "reverse: @comp_cigar_2\n"; - - my $MD_count_1 = 0; - foreach (@comp_cigar_1) { - ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't - } - - my $MD_count_2 = 0; - foreach (@comp_cigar_2) { - ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't - } - - $end_read_1 = $start_read_1 + $MD_count_1 - 1; - $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand - } - else { - ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 - - @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too - # print "reverse: @comp_cigar_1\n"; - - my $MD_count_1 = 0; - foreach (@comp_cigar_1) { - ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't - } - - $end_read_1 = $start_read_1; - $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand - } - - if ($strand eq '+') { - ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity - print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); - - # we next pass the second read which is in - orientation on the reverse strand - ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 - print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); - } else { - ## we first pass the first read which is in - orientation on the reverse strand - print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); - - # we next pass the second read which is in + orientation on the forward strand - ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 - print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); - } - - $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls - } - } - } - } else { - die "Single-end or paired-end reads not specified properly\n"; - } - - warn "\n\nProcessed $line_count lines from $filename in total\n"; - warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; - if ($report) { - print REPORT "\n\nProcessed $line_count lines from $filename in total\n"; - print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; - } - print_splitting_report (); -} - - - -sub print_splitting_report{ - - ### Calculating methylation percentages if applicable - - my $percent_meCpG; - if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ - $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); - } - - my $percent_meCHG; - if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ - $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); - } - - my $percent_meCHH; - if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ - $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); - } - - my $percent_non_CpG_methylation; - if ($merge_non_CpG){ - if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ - $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); - } - } - - if ($report){ - ### detailed information about Cs analysed - print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; - - my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; - print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; - - print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - - print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; - - ### calculating methylated CpG percentage if applicable - if ($percent_meCpG){ - print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; - } - else{ - print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - } - - ### 2-Context Output - if ($merge_non_CpG){ - if ($percent_non_CpG_methylation){ - print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; - } - else{ - print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; - } - } - - ### 3 Context Output - else{ - ### calculating methylated CHG percentage if applicable - if ($percent_meCHG){ - print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; - } - else{ - print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - } - - ### calculating methylated CHH percentage if applicable - if ($percent_meCHH){ - print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; - } - else{ - print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - } - } - } - - ### detailed information about Cs analysed for on-screen report - print "Final Cytosine Methylation Report\n",'='x33,"\n"; - - my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; - print "Total number of C's analysed:\t$total_number_of_C\n\n"; - - print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; - print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - - print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; - - ### printing methylated CpG percentage if applicable - if ($percent_meCpG){ - print "C methylated in CpG context:\t${percent_meCpG}%\n"; - } - else{ - print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; - } - - ### 2-Context Output - if ($merge_non_CpG){ - if ($percent_non_CpG_methylation){ - print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; - } - else{ - print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; - } - } - - ### 3-Context Output - else{ - ### printing methylated CHG percentage if applicable - if ($percent_meCHG){ - print "C methylated in CHG context:\t${percent_meCHG}%\n"; - } - else{ - print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; - } - - ### printing methylated CHH percentage if applicable - if ($percent_meCHH){ - print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; - } - else{ - print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - } - } -} - - - -sub print_individual_C_methylation_states_paired_end_files{ - - my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; - - ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 - die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); - - my @methylation_calls = split(//,$meth_call); - - ################################################################# - ### . for bases not involving cytosines ### - ### X for methylated C in CHG context (was protected) ### - ### x for not methylated C in CHG context (was converted) ### - ### H for methylated C in CHH context (was protected) ### - ### h for not methylated C in CHH context (was converted) ### - ### Z for methylated C in CpG context (was protected) ### - ### z for not methylated C in CpG context (was converted) ### - ### U for methylated C in Unknown context (was protected) ### - ### u for not methylated C in Unknown context (was converted) ### - ################################################################# - - my $methyl_CHG_count = 0; - my $methyl_CHH_count = 0; - my $methyl_CpG_count = 0; - my $unmethylated_CHG_count = 0; - my $unmethylated_CHH_count = 0; - my $unmethylated_CpG_count = 0; - - my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions - my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels - my @comp_cigar; - - ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing - if ($cigar =~ /^\d+M$/){ - # this check speeds up the extraction process by up to 60%!!! - } - else{ # parsing CIGAR string - my @len; - my @ops; - @len = split (/\D+/,$cigar); # storing the length per operation - @ops = split (/\d+/,$cigar); # storing the operation - shift @ops; # remove the empty first element - - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - foreach my $index (0..$#len){ - foreach (1..$len[$index]){ - # print "$ops[$index]"; - push @comp_cigar, $ops[$index]; - } - } - # warn "\nDetected CIGAR string: $cigar\n"; - # warn "Length of methylation call: ",length $meth_call,"\n"; - # warn "number of operations: ",scalar @ops,"\n"; - # warn "number of length digits: ",scalar @len,"\n\n"; - # print @comp_cigar,"\n"; - # print "$meth_call\n\n"; - # sleep (1); - } - - if ($strand eq '-') { - - ### the CIGAR string needs to be reversed, the methylation call has already been reversed above - if (@comp_cigar){ - @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too - } - # print "reverse CIGAR string: @comp_cigar\n"; - - ### the start position of paired-end files has already been corrected, see above - } - - ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified - - if ($merge_non_CpG) { - if ($no_overlap) { # this has to be read 2... - - ### single-file CpG and non-CpG context output - if ($full) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start+$index+$pos_offset >= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.'){} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); - } - } - } - elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start-$index+$pos_offset <= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); - } - } - } else { - die "The read orientation was neither + nor -: '$strand'\n"; - } - } - - ### strand-specific methylation output - else { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start+$index+$pos_offset >= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start-$index+$pos_offset <= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation was neither + nor -: '$strand'/n"; - } - } - } - - ### this is the default paired-end procedure allowing overlaps and using every single C position - ### Still within the 2-CONTEXT ONLY optional section - else { - ### single-file CpG and non-CpG context output - if ($full) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - - ### strand-specific methylation output - ### still within the 2-CONTEXT optional section - else { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - } - } - - ############################################ - ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### - ############################################ - - elsif ($no_overlap) { - ### single-file CpG, CHG and CHH context output - if ($full) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start+$index+$pos_offset >= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start-$index+$pos_offset <= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - - ### strand-specific methylation output - else { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start+$index+$pos_offset >= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### Returning as soon as the methylation calls start overlapping - if ($start-$index+$pos_offset <= $end_read_1) { - return; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - } - - ### this is the default paired-end procedure allowing overlaps and using every single C position - else { - ### single-file CpG, CHG and CHH context output - if ($full) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - - ### strand-specific methylation output - else { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } elsif ($strand eq '-') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CpG}->{$index+1}->{un}++; - } - else{ - $mbias_2{CpG}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{meth}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{meth}++; - } - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - if ($read_identity == 1){ - $mbias_1{CHH}->{$index+1}->{un}++; - } - else{ - $mbias_2{CHH}->{$index+1}->{un}++; - } - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } else { - die "The strand orientation as neither + nor -: '$strand'\n"; - } - } - } -} - -sub check_cigar_string { - my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; - # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; - my ($new_cigar_offset,$new_pos_offset) = (0,0); - - if ($strand eq '+') { - # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; - - if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position - # warn "position needs no adjustment\n"; - } - - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence - $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position - # warn "adjusted genomic position by -1 bp (insertion)\n"; - } - - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence - $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index - $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position - # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; - - while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ - if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position - # warn "position needs no adjustment\n"; - last; - } - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ - $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position - # warn "adjusted genomic position by another -1 bp (insertion)\n"; - last; - } - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence - $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index - $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position - # warn "adjusted genomic position by another +1 bp (deletion)\n"; - } - else{ - die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; - } - } - } - else{ - die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; - } - } - - elsif ($strand eq '-') { - # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; - - if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position - # warn "position needs no adjustment\n"; - } - - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence - $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position - # warn "adjusted genomic position by +1 bp (insertion)\n"; - } - - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence - $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index - $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position - # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; - - while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ - if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position - # warn "Found new 'M' operation; position needs no adjustment\n"; - last; - } - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ - $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position - # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; - last; - } - elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence - $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index - $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position - # warn "adjusted genomic position by another -1 bp (deletion)\n"; - } - else{ - die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; - } - } - } - else{ - die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; - } - } - # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; - return ($new_cigar_offset,$new_pos_offset); -} - -sub print_individual_C_methylation_states_single_end{ - - my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; - my @methylation_calls = split(//,$meth_call); - - ################################################################# - ### . for bases not involving cytosines ### - ### X for methylated C in CHG context (was protected) ### - ### x for not methylated C in CHG context (was converted) ### - ### H for methylated C in CHH context (was protected) ### - ### h for not methylated C in CHH context (was converted) ### - ### Z for methylated C in CpG context (was protected) ### - ### z for not methylated C in CpG context (was converted) ### - ################################################################# - - my $methyl_CHG_count = 0; - my $methyl_CHH_count = 0; - my $methyl_CpG_count = 0; - my $unmethylated_CHG_count = 0; - my $unmethylated_CHH_count = 0; - my $unmethylated_CpG_count = 0; - - my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions - my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels - - my @comp_cigar; - - if ($cigar){ # parsing CIGAR string - - ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing - if ($cigar =~ /^\d+M$/){ - # warn "See!? I told you so! $cigar\n"; - # sleep(1); - } - else{ - - my @len; - my @ops; - - @len = split (/\D+/,$cigar); # storing the length per operation - @ops = split (/\d+/,$cigar); # storing the operation - shift @ops; # remove the empty first element - # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); - die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); - - foreach my $index (0..$#len){ - foreach (1..$len[$index]){ - # print "$ops[$index]"; - push @comp_cigar, $ops[$index]; - } - } - } - # warn "\nDetected CIGAR string: $cigar\n"; - # warn "Length of methylation call: ",length $meth_call,"\n"; - # warn "number of operations: ",scalar @ops,"\n"; - # warn "number of length digits: ",scalar @len,"\n\n"; - # print @comp_cigar,"\n"; - # print "$meth_call\n\n"; - # sleep (1); - } - - ### adjusting the start position for all reads mapping to the reverse strand - if ($strand eq '-') { - - if (@comp_cigar){ # only needed for SAM reads with InDels - @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too - # print @comp_cigar,"\n"; - } - - unless ($ignore){ ### if --ignore was specified the start position has already been corrected - - if ($cigar){ ### SAM format - if ($cigar =~ /^(\d+)M$/){ # linear match - $start += $1 - 1; - } - else{ # InDel read - my $MD_count = 0; - foreach (@comp_cigar){ - ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't - } - $start += $MD_count - 1; - } - } - else{ ### vanilla format - $start += length($meth_call)-1; - } - } - } - - ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) - - ### single-file CpG and other-context output - if ($full and $merge_non_CpG) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - - if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - elsif ($strand eq '-') { - - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels - # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.'){} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - else { - die "The strand information was neither + nor -: $strand\n"; - } - } - - ### strand-specific methylation output - elsif ($merge_non_CpG) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - elsif ($strand eq '-') { - - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - else { - die "The strand information was neither + nor -: $strand\n"; - } - } - - ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION - - elsif ($full) { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); - } - } - } - elsif ($strand eq '-') { - - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - else { - die "The read had a strand orientation which was neither + nor -: $strand\n"; - } - } - - ### strand-specific methylation output - else { - if ($strand eq '+') { - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - elsif ($strand eq '-') { - - for my $index (0..$#methylation_calls) { - ### methylated Cs (any context) will receive a forward (+) orientation - ### not methylated Cs (any context) will receive a reverse (-) orientation - - if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels - my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); - $cigar_offset += $cigar_mod; - $pos_offset += $pos_mod; - } - - if ($methylation_calls[$index] eq 'X') { - $counting{total_meCHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'x') { - $counting{total_unmethylated_CHG_count}++; - print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'Z') { - $counting{total_meCpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'z') { - $counting{total_unmethylated_CpG_count}++; - print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CpG}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq 'H') { - $counting{total_meCHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{meth}++; - } - elsif ($methylation_calls[$index] eq 'h') { - $counting{total_unmethylated_CHH_count}++; - print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); - $mbias_1{CHH}->{$index+1}->{un}++; - } - elsif ($methylation_calls[$index] eq '.') {} - elsif (lc$methylation_calls[$index] eq 'u'){} - else{ - die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; - } - } - } - else { - die "The strand information was neither + nor -: $strand\n"; - } - } -} - - - -sub print_helpfile{ - - print << 'HOW_TO'; - - -DESCRIPTION - -The following is a brief description of all options to control the Bismark -methylation extractor. The script reads in a bisulfite read alignment results file -produced by the Bismark bisulfite mapper and extracts the methylation information -for individual cytosines. This information is found in the methylation call field -which can contain the following characters: - - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ~~~ X for methylated C in CHG context ~~~ - ~~~ x for not methylated C CHG ~~~ - ~~~ H for methylated C in CHH context ~~~ - ~~~ h for not methylated C in CHH context ~~~ - ~~~ Z for methylated C in CpG context ~~~ - ~~~ z for not methylated C in CpG context ~~~ - ~~~ U for methylated C in Unknown context (CN or CHN ~~~ - ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ - ~~~ . for any bases not involving cytosines ~~~ - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The methylation extractor outputs result files for cytosines in CpG, CHG and CHH -context (this distinction is actually already made in Bismark itself). As the methylation -information for every C analysed can produce files which easily have tens or even hundreds of -millions of lines, file sizes can become very large and more difficult to handle. The C -methylation info additionally splits cytosine methylation calls up into one of the four possible -strands a given bisulfite read aligned against: - - OT original top strand - CTOT complementary to original top strand - - OB original bottom strand - CTOB complementary to original bottom strand - -Thus, by default twelve individual output files are being generated per input file (unless ---comprehensive is specified, see below). The output files can be imported into a genome -viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact -unless the bisulfite reads were generated preserving directionality it doesn't make any -sense to look at the data in a strand-specific manner). Strand-specific output files can -optionally be skipped, in which case only three output files for CpG, CHG or CHH context -will be generated. For both the strand-specific and comprehensive outputs there is also -the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. - - -The output files are in the following format (tab delimited): - - - - -USAGE: methylation_extractor [options] - - -ARGUMENTS: -========== - - A space-separated list of Bismark result files in SAM format from - which methylation information is extracted for every cytosine in - the reads. For alignment files in the older custom Bismark output - see option '--vanilla'. - -OPTIONS: - --s/--single-end Input file(s) are Bismark result file(s) generated from single-end - read data. Specifying either --single-end or --paired-end is - mandatory. - --p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end - read data. Specifying either --paired-end or --single-end is - mandatory. - ---vanilla The Bismark result input file(s) are in the old custom Bismark format - (up to version 0.5.x) and not in SAM format which is the default as - of Bismark version 0.6.x or higher. Default: OFF. - ---no_overlap For paired-end reads it is theoretically possible that read_1 and - read_2 overlap. This option avoids scoring overlapping methylation - calls twice (only methylation calls of read 1 are used for in the process - since read 1 has historically higher quality basecalls than read 2). - Whilst this option removes a bias towards more methylation calls - in the center of sequenced fragments it may de facto remove a sizable - proportion of the data. This option is highly recommended for paired-end - data. - ---ignore Ignore the first bp from the 5' end of Read 1 when processing the - methylation call string. This can remove e.g. a restriction enzyme site - at the start of each read or any other source of bias (e.g. PBAT-Seq data). - ---ignore_r2 Ignore the first bp from the 5' end of Read 2 of paired-end sequencing - results only. Since the first couple of bases in Read 2 of BS-Seq experiments - show a severe bias towards non-methylation as a result of end-repairing - sonicated fragments with unmethylated cytosines (see M-bias plot), it is - recommended that the first couple of bp of Read 2 are removed before - starting downstream analysis. Please see the section on M-bias plots in the - Bismark User Guide for more details. - ---comprehensive Specifying this option will merge all four possible strand-specific - methylation info into context-dependent output files. The default - - contexts are: - - CpG context - - CHG context - - CHH context - ---merge_non_CpG This will produce two output files (in --comprehensive mode) or eight - strand-specific output files (default) for Cs in - - CpG context - - non-CpG context - ---report Prints out a short methylation summary as well as the paramaters used to run - this script. - ---no_header Suppresses the Bismark version header line in all output files for more convenient - batch processing. - --o/--output DIR Allows specification of a different output directory (absolute or relative - path). If not specified explicitely, the output will be written to the current directory. - ---samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified - explicitly if Samtools is in the PATH already. - ---gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in - a GZIP compressed form to save disk space. This option does not work on bedGraph and - genome-wide cytosine reports as they are 'tiny' anyway. - ---version Displays version information. - --h/--help Displays this help file and exits. - ---mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as - well as a report (optional) and then quit. Default: OFF. - - - -bedGraph specific options: -========================== - ---bedGraph After finishing the methylation extraction, the methylation output is written into a - sorted bedGraph file that reports the position of a given cytosine and its methylation - state (in %, see details below). The methylation extractor output is temporarily split up into - temporary files, one per chromosome (written into the current directory or folder - specified with -o/--output); these temp files are then used for sorting and deleted - afterwards. By default, only cytosines in CpG context will be sorted. The option - '--CX_context' may be used to report all cytosines irrespective of sequence context - (this will take MUCH longer!). The default folder for temporary files during the sorting - process is the output directory. The bedGraph conversion step is performed by the external - module 'bismark2bedGraph'; this script needs to reside in the same folder as the - bismark_methylation_extractor itself. - - ---cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide - before its methylation percentage is reported. Default: 1. - ---remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. - - ---CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered - in the experiment irrespective of its sequence context. This applies to both forward and - reverse strands. Please be aware that this option may generate large temporary and output files - and may take a long time to sort (up to many hours). Default: OFF. - (i.e. Default = CpG context only). - ---buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information. - Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or - a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. - (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. - Defaults to 2G. - ---scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of - scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to - individual chromosome files. These errors were caused by the operating system's limit - of the number of filehandle that can be written to at any one time (typically 1024; to - find out this limit on Linux, type: ulimit -a). - To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort - methylation calls into individual chromosome files. Instead, all input files are - temporarily merged into a single file (unless there is only a single file), and this - file will then be sorted by both chromosome AND position using the Unix sort command. - Please be aware that this option might take a looooong time to complete, depending on - the size of the input files, and the memory you allocate to this process (see --buffer_size). - Nevertheless, it seems to be working. - ---ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will - instead use two arrays to sort methylated and unmethylated calls. This may result in a faster - sorting process of very large files, but this comes at the cost of a larger memory footprint - (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB - of RAM). Due to overheads in creating and looping through these arrays it seems that it will - actually be *slower* for small files (few million alignments), and we are currently testing at - which point it is advisable to use this option. Note that --ample_memory is not compatible - with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). - - - -Genome-wide cytosine methylation report specific options: -========================================================= - ---cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a - genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based - chromosome coordinates (zero-based cords are optional) and reports CpG context only (all - cytosine context is optional). The output considers all Cs on both forward and reverse strands and - reports their position, strand, trinucleotide content and methylation state (counts are 0 if not - covered). The cytsoine report conversion step is performed by the external module - 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor - itself. - ---CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of - its context. This applies to both forward and reverse strands. Please be aware that this will - generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. - Default: OFF (i.e. Default = CpG context only). - ---zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. - ---genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted - formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. - ---split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files - will be named to include the input filename and the chromosome number. - - - -OUTPUT: - -The bismark_methylation_extractor output is in the form: -======================================================== - - -* Methylated cytosines receive a '+' orientation, -* Unmethylated cytosines receive a '-' orientation. - - - -The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): -========================================================================================================= - -track type=bedGraph (header line) - - - - - -The coverage output looks like this (tab-delimited, 1-based genomic coords): -============================================================================ - - - - - -The genome-wide cytosine methylation output file is tab-delimited in the following format: -========================================================================================== - - - - -This script was last modified on 25 November 2013. - -HOW_TO -} diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methylation_extractor.py --- a/bismark_methylation_extractor.py Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,197 +0,0 @@ -#!/usr/bin/env python - -import argparse, os, shutil, subprocess, sys, tempfile, fileinput -import zipfile -from glob import glob - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def zipper(dir, zip_file): - zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED) - root_len = len(os.path.abspath(dir)) - for root, dirs, files in os.walk(dir): - archive_root = os.path.abspath(root)[root_len:] - for f in files: - fullpath = os.path.join(root, f) - archive_name = os.path.join(archive_root, f) - zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED) - zip.close() - return zip_file - -def __main__(): - #Parse Command Line - parser = argparse.ArgumentParser(description='Wrapper for the bismark methylation caller.') - - # input options - parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) - - parser.add_argument( '--infile', help='Input file in SAM or BAM format.' ) - parser.add_argument( '--single-end', dest='single_end', action="store_true" ) - parser.add_argument( '--paired-end', dest='paired_end', action="store_true" ) - - parser.add_argument( '--report-file', dest='report_file' ) - parser.add_argument( '--comprehensive', action="store_true" ) - parser.add_argument( '--merge-non-cpg', dest='merge_non_cpg', action="store_true" ) - parser.add_argument( '--no-overlap', dest='no_overlap', action="store_true" ) - parser.add_argument( '--compress' ) - parser.add_argument( '--ignore-bps', dest='ignore_bps', type=int ) - - # OT - original top strand - parser.add_argument( '--cpg_ot' ) - parser.add_argument( '--chg_ot' ) - parser.add_argument( '--chh_ot' ) - # CTOT - complementary to original top strand - parser.add_argument( '--cpg_ctot' ) - parser.add_argument( '--chg_ctot' ) - parser.add_argument( '--chh_ctot' ) - # OB - original bottom strand - parser.add_argument( '--cpg_ob' ) - parser.add_argument( '--chg_ob' ) - parser.add_argument( '--chh_ob' ) - # CTOT - complementary to original bottom strand - parser.add_argument( '--cpg_ctob' ) - parser.add_argument( '--chg_ctob' ) - parser.add_argument( '--chh_ctob' ) - - parser.add_argument( '--cpg_context' ) - parser.add_argument( '--chg_context' ) - parser.add_argument( '--chh_context' ) - - parser.add_argument( '--non_cpg_context' ) - - parser.add_argument( '--non_cpg_context_ot' ) - parser.add_argument( '--non_cpg_context_ctot' ) - parser.add_argument( '--non_cpg_context_ob' ) - parser.add_argument( '--non_cpg_context_ctob' ) - - args = parser.parse_args() - - - # Build methylation extractor command - output_dir = tempfile.mkdtemp() - cmd = 'bismark_methylation_extractor --no_header -o %s %s %s' - if args.bismark_path: - # add the path to the bismark perl scripts, that is needed for galaxy - cmd = os.path.join(args.bismark_path, cmd) - - additional_opts = '' - # Set up all options - if args.single_end: - additional_opts += ' --single-end ' - else: - additional_opts += ' --paired-end ' - if args.no_overlap: - additional_opts += ' --no_overlap ' - if args.ignore_bps: - additional_opts += ' --ignore %s ' % args.ignore_bps - if args.comprehensive: - additional_opts += ' --comprehensive ' - if args.merge_non_cpg: - additional_opts += ' --merge_non_CpG ' - if args.report_file: - additional_opts += ' --report ' - - #detect BAM file, use samtools view if it is a bam file - f = open (args.infile, 'rb') - sig = f.read(4) - f.close() - if sig == '\x1f\x8b\x08\x04' : - cmd = cmd % (output_dir, additional_opts, '-') - cmd = 'samtools view %s | %s' % (args.infile, cmd ) - else : - cmd = cmd % (output_dir, additional_opts, args.infile) - - # Run - try: - tmp_out = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp_out, 'wb' ) - tmp_err = tempfile.NamedTemporaryFile().name - tmp_stderr = open( tmp_err, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stdout.close() - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - - # TODO: look for errors in program output. - except Exception, e: - stop_err( 'Error in bismark methylation extractor:\n' + str( e ) ) - - - # collect and copy output files - - if args.compress: - zipper(output_dir, args.compress) - - - if args.cpg_ot: - shutil.move( glob(os.path.join( output_dir, '*CpG_OT_*'))[0], args.cpg_ot ) - if args.chg_ot: - shutil.move( glob(os.path.join( output_dir, '*CHG_OT_*'))[0], args.chg_ot ) - if args.chh_ot: - shutil.move( glob(os.path.join( output_dir, '*CHH_OT_*'))[0], args.chh_ot ) - if args.cpg_ctot: - shutil.move( glob(os.path.join( output_dir, '*CpG_CTOT_*'))[0], args.cpg_ctot ) - if args.chg_ctot: - shutil.move( glob(os.path.join( output_dir, '*CHG_CTOT_*'))[0], args.chg_ctot ) - if args.chh_ctot: - shutil.move( glob(os.path.join( output_dir, '*CHH_CTOT_*'))[0], args.chh_ctot ) - if args.cpg_ob: - shutil.move( glob(os.path.join( output_dir, '*CpG_OB_*'))[0], args.cpg_ob ) - if args.chg_ob: - shutil.move( glob(os.path.join( output_dir, '*CHG_OB_*'))[0], args.chg_ob ) - if args.chh_ob: - shutil.move( glob(os.path.join( output_dir, '*CHH_OB_*'))[0], args.chh_ob ) - if args.cpg_ctob: - shutil.move( glob(os.path.join( output_dir, '*CpG_CTOB_*'))[0], args.cpg_ctob ) - if args.chg_ctob: - shutil.move( glob(os.path.join( output_dir, '*CHG_CTOB_*'))[0], args.chg_ctob ) - if args.chh_ctob: - shutil.move( glob(os.path.join( output_dir, '*CHH_CTOB_*'))[0], args.chh_ctob ) - - # context-dependent methylation output files - if args.cpg_context: - shutil.move( glob(os.path.join( output_dir, '*CpG_context_*'))[0], args.cpg_context ) - if args.chg_context: - shutil.move( glob(os.path.join( output_dir, '*CHG_context_*'))[0], args.chg_context ) - if args.chh_context: - shutil.move( glob(os.path.join( output_dir, '*CHH_context_*'))[0], args.chh_context ) - - if args.non_cpg_context: - shutil.move( glob(os.path.join( output_dir, '*Non_CpG_context_*'))[0], args.non_cpg_context ) - - if args.non_cpg_context_ot: - shutil.move( glob(os.path.join( output_dir, '*Non_CpG_OT_*'))[0], args.non_cpg_context_ot ) - if args.non_cpg_context_ctot: - shutil.move( glob(os.path.join( output_dir, '*Non_CpG_CTOT_*'))[0], args.non_cpg_context_ctot ) - if args.non_cpg_context_ob: - shutil.move( glob(os.path.join( output_dir, '*Non_CpG_OB_*'))[0], args.non_cpg_context_ob ) - if args.non_cpg_context_ctob: - shutil.move( glob(os.path.join( output_dir, '*Non_CpG_CTOB_*'))[0], args.non_cpg_context_ctob ) - - - - if args.report_file: - shutil.move( glob(os.path.join( output_dir, '*_splitting_report*'))[0], args.report_file ) - - - # Clean up temp dirs - if os.path.exists( output_dir ): - shutil.rmtree( output_dir ) - -if __name__=="__main__": __main__() diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_methylation_extractor.xml --- a/bismark_methylation_extractor.xml Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,309 +0,0 @@ - - - Reports on methylation status of reads mapped by Bismark - - - SCRIPT_PATH - bowtie - bowtie2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( report is True ) - - - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - ( compress == False and comprehensive == False and merge_non_CpG == False) - - - - - ( compress == False and comprehensive) - - - ( compress == False and comprehensive and merge_non_CpG == False) - - - ( compress == False and comprehensive and merge_non_CpG == False) - - - - ( compress == False and comprehensive and merge_non_cpg) - - - - ( compress == False and comprehensive == False and merge_non_cpg) - - - ( compress == False and comprehensive == False and merge_non_cpg) - - - ( compress == False and comprehensive == False and merge_non_cpg) - - - ( compress == False and comprehensive == False and merge_non_cpg) - - - - ( compress ) - - - - - - - - - - diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_pretty_report/bismark2report --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_pretty_report/bismark2report Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,1237 @@ +#!/usr/bin/env perl +use warnings; +use strict; +use Getopt::Long; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my $bismark2report_version = 'v0.16.3'; +my (@alignment_reports,@dedup_reports,@splitting_reports,@mbias_reports,@nuc_reports); + +my ($output_dir,$verbose,$manual_output_file) = process_commandline(); + +# print join (",",@alignment_reports)."\n"; +# print join (",",@dedup_reports)."\n"; +# print join (",",@splitting_reports)."\n"; +# print join (",",@mbias_reports)."\n"; +# print join (",",@nuc_reports)."\n"; + +while (@alignment_reports){ + + my $alignment_report = shift @alignment_reports; + my $dedup_report = shift @dedup_reports; + my $splitting_report = shift @splitting_reports; + my $mbias_report = shift @mbias_reports; + my $nuc_report = shift @nuc_reports; + + ### HTML OUTPUT FILE + my $report_output = $alignment_report; + $report_output =~ s/^.*\///; # deleting optional path information + $report_output =~ s/\.txt$//; + $report_output =~ s/$/.html/; + + # if -o output_file was specified we are going to use that name preferentially. This may only happen if there is a single report in the folder, or if a single report has been specified manually + if ($manual_output_file){ + warn "A specific output filename was specified: $manual_output_file. Using that one instead of deriving the filename\n"; sleep(1); + $report_output = $manual_output_file; + } + + $report_output = $output_dir.$report_output; + warn "\nWriting Bismark HTML report to >> $report_output <<\n\n"; + + my $doc = read_report_template(); #reading and storing the entire report template + + # BISMARK ALIGNMENT REPORT (mandatory) + warn "="x110,"\n"; + warn "Using the following alignment report:\t\t> $alignment_report <\n"; + # DEDUPLICATION REPORT (optional) + if ($dedup_report){ + warn "Using the following deduplication report:\t> $dedup_report <\n"; + } + else{ + warn "No deduplication report file specified, skipping this step\n"; + } + + # SPLITTING REPORT (optional) + if ($splitting_report){ + warn "Using the following splitting report:\t\t> $splitting_report <\n"; + } + else{ + warn "No splitting report file specified, skipping this step\n"; + } + + # M-BIAS REPORT (optional) + if ($mbias_report){ + warn "Using the following M-bias report:\t\t> $mbias_report <\n"; + } + else{ + warn "No M-bias report file specified, skipping this step\n"; + } + + # NUCLEOTIDE COVERAGE REPORT (optional) + if ($nuc_report){ + warn "Using the following nucleotide coverage report:\t> $nuc_report <\n"; + } + else{ + warn "No nucleotide coverage report file specified, skipping this step\n"; + } + warn "="x110,"\n\n\n"; + $verbose and sleep(3); + + # creating timestamp + $doc = getLoggingTime($doc); + + $doc = read_alignment_report($alignment_report,$doc); # mandatory + + if ($dedup_report){ # optional + $doc = read_deduplication_report($dedup_report,$doc); + + # removing the delete tags in the html template + $doc =~ s/\{\{start_deletion_duplication\}\}//g; + $doc =~ s/\{\{end_deletion_duplication\}\}//g; + } + else{ + # removing the entire graph and table section for the deduplication part + warn "Removing the duplication section from the HTML report\n" if ($verbose); + $doc =~ s/(\{\{start_deletion_duplication.*?end_deletion_duplication\}\})//gs; # s includes newline chars; non-greedy + warn "Deleting the following content:\n$1\n\n" if ($verbose); + sleep(3) if ($verbose); + } + + if ($nuc_report){ # optional + $doc = read_nucleotide_coverage_report($nuc_report,$doc); + + # removing the delete tags in the html template + $doc =~ s/\{\{start_deletion_nucleotide_coverage\}\}//g; + $doc =~ s/\{\{end_deletion_nucleotide_coverage\}\}//g; + } + else{ + # removing the entire graph and table section for the nucleotide coverage part + warn "Removing the nucleotide coverage section from the HTML report\n" if ($verbose); + $doc =~ s/(\{\{start_deletion_nucleotide_coverage.*?end_deletion_nucleotide_coverage\}\})//gs; # s includes newline chars; non-greedy + warn "Deleting the following content:\n$1\n\n" if ($verbose); + sleep(3) if ($verbose); + } + + if ($splitting_report){ # optional + $doc = read_splitting_report($splitting_report,$doc); + + # removing the delete tags in the html template + $doc =~ s/\{\{start_deletion_splitting\}\}//g; + $doc =~ s/\{\{end_deletion_splitting\}\}//g; + } + else{ + # removing the entire graph and table section for the deduplication part + warn "Removing the splitting report section from the HTML report\n" if ($verbose); + $doc =~ s/(\{\{start_deletion_splitting.*?end_deletion_splitting\}\})//gs; # s includes newline chars; non-greedy + warn "Deleting the following content:\n$1\n\n" if ($verbose); + sleep(3) if ($verbose); + } + + if ($mbias_report){ # optional + (my $state, $doc) = read_mbias_report($mbias_report,$doc); + + # removing the delete tags in the html template + $doc =~ s/\{\{start_deletion_mbias\}\}//g; + $doc =~ s/\{\{end_deletion_mbias\}\}//g; + + warn "Reported read state: $state\n\n" if ($verbose); + # if the report was single-end we need to remove the second plot only + if ($state eq 'single'){ + $doc =~ s/(\{\{start_deletion_mbias_2.*?end_deletion_mbias_2\}\})//gs; # s includes newline chars; non-greedy + warn "Deleting the following content:\n$1\n\n" if ($verbose); + } + else{ + $doc =~ s/\{\{start_deletion_mbias_2\}\}//g; + $doc =~ s/\{\{end_deletion_mbias_2\}\}//g; + } + } + else{ + # removing the entire graph and table section for the deduplication part + $doc =~ s/(\{\{start_deletion_mbias.*?end_deletion_mbias\}\})//gs; # s includes newline chars; non-greedy + warn "Deleting the following content:\n$1\n\n" if ($verbose); + sleep(3) if ($verbose); + } + + write_out_report($report_output,$doc); + +} + +sub write_out_report{ + my ($report_output,$doc) = @_; + open (OUT,'>',$report_output) or die "Failed to write to output file $report_output: $!\n\n"; + print OUT $doc; +} + +sub getLoggingTime { + my $doc = shift; + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time); + + my $time = sprintf ("%02d:%02d", $hour,$min,$sec); + my $date = sprintf ("%04d-%02d-%02d", $year+1900,$mon+1,$mday); + warn "Using Time: $time, and date: $date\n\n" if ($verbose); + + $doc =~ s/\{\{date\}\}/$date/g; + $doc =~ s/\{\{time\}\}/$time/g; + + return $doc; +} + + +sub read_alignment_report{ + + my ($alignment_report,$doc) = @_; + + warn "Processing alignment report $alignment_report ...\n"; + open (ALN,$alignment_report) or die "Couldn't read from file $alignment_report: $!\n\n"; + + my $unique; + my $no_aln; + my $multiple; + my $no_genomic; + my $total_seqs; + my $bismark_version; + my $input_filename; + + my $unique_text; + my $no_aln_text; + my $multiple_text; + my $total_seq_text; + + my $total_C_count; + my ($meth_CpG,$meth_CHG,$meth_CHH,$meth_unknown); + my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH,$unmeth_unknown); + my ($perc_CpG,$perc_CHG,$perc_CHH,$perc_unknown); + + my $number_OT; + my $number_CTOT; + my $number_CTOB; + my $number_OB; + + while (){ + chomp; + + ### General Alignment stats + if ($_ =~ /^Sequence pairs analysed in total:/ ){ ## Paired-end + (undef,$total_seqs) = split /\t/; + print "Total paired seqs: >> $total_seqs <<\n" if ($verbose); + $total_seq_text = 'Sequence pairs analysed in total'; + } + elsif ($_ =~ /^Sequences analysed in total:/ ){ ## Single-end + (undef,$total_seqs) = split /\t/; + $total_seq_text = 'Sequences analysed in total'; + print "total single-end seqs >> $total_seqs <<\n" if ($verbose); + } + + elsif($_ =~ /^Bismark report for: (.*) \(version: (.*)\)/){ + $input_filename = $1; + $bismark_version = $2; + print "Input filename(s) >> $input_filename <<\n" if ($verbose); + print "Bismark version >> $bismark_version <<\n" if ($verbose); + } + + elsif($_ =~ /^Number of paired-end alignments with a unique best hit:/){ ## Paired-end + (undef,$unique) = split /\t/; + print "Unique PE>> $unique <<\n" if ($verbose);; + $unique_text = 'Paired-end alignments with a unique best hit'; + } + elsif($_ =~ /^Number of alignments with a unique best hit from/){ ## Single-end + (undef,$unique) = split /\t/; + print "Unique SE>> $unique <<\n" if ($verbose); + $unique_text = 'Single-end alignments with a unique best hit'; + } + + elsif($_ =~ /^Sequence pairs with no alignments under any condition:/){ ## Paired-end + (undef,$no_aln) = split /\t/; + print "No alignment PE >> $no_aln <<\n" if ($verbose); + $no_aln_text = 'Pairs without alignments under any condition'; + } + elsif($_ =~ /^Sequences with no alignments under any condition:/){ ## Single-end + (undef,$no_aln) = split /\t/; + print "No alignments SE>> $no_aln <<\n" if ($verbose); + $no_aln_text = 'Sequences without alignments under any condition'; + } + + elsif($_ =~ /^Sequence pairs did not map uniquely:/){ ## Paired-end + (undef,$multiple) = split /\t/; + print "Multiple alignments PE >> $multiple <<\n" if ($verbose); + $multiple_text = 'Pairs that did not map uniquely'; + } + elsif($_ =~ /^Sequences did not map uniquely:/){ ## Single-end + (undef,$multiple) = split /\t/; + print "Multiple alignments SE >> $multiple <<\n" if ($verbose); + $multiple_text = 'Sequences that did not map uniquely'; + } + + elsif($_ =~ /^Sequence pairs which were discarded because genomic sequence could not be extracted:/){ ## Paired-end + (undef,$no_genomic) = split /\t/; + print "No genomic sequence PE >> $no_genomic <<\n" if ($verbose); + } + elsif($_ =~ /^Sequences which were discarded because genomic sequence could not be extracted:/){ ## Single-end + (undef,$no_genomic) = split /\t/; + print "No genomic sequence SE>> $no_genomic <<\n" if ($verbose); + } + + ### Context Methylation + elsif($_ =~ /^Total number of C/ ){ + (undef,$total_C_count) = split /\t/; + print "Total number C >> $total_C_count <<\n" if ($verbose); + } + + elsif($_ =~ /^Total methylated C\'s in CpG context:/ ){ + (undef,$meth_CpG) = split /\t/; + print "meth CpG >> $meth_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in CHG context:/ ){ + (undef,$meth_CHG) = split /\t/; + print "meth CHG >> $meth_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in CHH context:/ ){ + (undef,$meth_CHH) = split /\t/; + print "meth CHH >> $meth_CHH <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in Unknown context:/ ){ + (undef,$meth_unknown) = split /\t/; + print "meth Unknown >> $meth_unknown <<\n" if ($verbose); + } + + elsif($_ =~ /^Total unmethylated C\'s in CpG context:/ or $_ =~ /^Total C to T conversions in CpG context:/){ + (undef,$unmeth_CpG) = split /\t/; + print "unmeth CpG >> $unmeth_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^Total unmethylated C\'s in CHG context:/ or $_ =~ /^Total C to T conversions in CHG context:/){ + (undef,$unmeth_CHG) = split /\t/; + print "unmeth CHG >> $unmeth_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^Total unmethylated C\'s in CHH context:/ or $_ =~ /^Total C to T conversions in CHH context:/){ + (undef,$unmeth_CHH) = split /\t/; + print "unmeth CHH >> $unmeth_CHH <<\n"if ($verbose); + } + elsif($_ =~ /^Total unmethylated C\'s in Unknown context:/ or $_ =~ /^Total C to T conversions in Unknown context:/){ + (undef,$unmeth_unknown) = split /\t/; + print "unmeth Unknown >> $unmeth_unknown <<\n" if ($verbose); + } + + elsif($_ =~ /^C methylated in CpG context:/ ){ + (undef,$perc_CpG) = split /\t/; + $perc_CpG =~ s/%//; + print "percentage CpG >> $perc_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in CHG context:/ ){ + (undef,$perc_CHG) = split /\t/; + $perc_CHG =~ s/%//; + print "percentage CHG >> $perc_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in CHH context:/ ){ + (undef,$perc_CHH) = split /\t/; + $perc_CHH =~ s/%//; + print "percentage CHH >> $perc_CHH <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in Unknown context:/ ){ + (undef,$perc_unknown) = split /\t/; + $perc_unknown =~ s/%//; + print "percentage Unknown >> $perc_unknown <<\n" if ($verbose); + } + + + ### Strand Origin + + elsif($_ =~ /^CT\/GA\/CT:/ ){ ## Paired-end + (undef,$number_OT) = split /\t/; + print "Number OT PE>> $number_OT <<\n" if ($verbose); + } + elsif($_ =~ /^CT\/CT:/ ){ ## Single-end + (undef,$number_OT) = split /\t/; + print "Number OT SE>> $number_OT <<\n" if ($verbose); + } + + elsif($_ =~ /^GA\/CT\/CT:/ ){ ## Paired-end + (undef,$number_CTOT) = split /\t/; + print "Number CTOT PE >> $number_CTOT <<\n" if ($verbose); + } + elsif($_ =~ /^GA\/CT:/ ){ ## Single-end + (undef,$number_CTOT) = split /\t/; + print "Number CTOT SE >> $number_CTOT <<\n" if ($verbose); + } + + elsif($_ =~ /^GA\/CT\/GA:/ ){ ## Paired-end + (undef,$number_CTOB) = split /\t/; + print "Number CTOB PE >> $number_CTOB <<\n" if ($verbose); + } + elsif($_ =~ /^GA\/GA:/ ){ ## Single-end + (undef,$number_CTOB) = split /\t/; + print "Number CTOB SE >> $number_CTOB <<\n" if ($verbose); + } + + elsif($_ =~ /^CT\/GA\/GA:/ ){ ## Paired-end + (undef,$number_OB) = split /\t/; + print "Number OB PE >> $number_OB <<\n" if ($verbose); + } + elsif($_ =~ /^CT\/GA:/ ){ ## Single-end + (undef,$number_OB) = split /\t/; + print "Number OB SE >> $number_OB <<\n" if ($verbose); + } + + + } + + if (defined $unique and defined $no_aln and defined $multiple and defined $no_genomic and defined $total_seqs){ + warn "Got all necessary information, editing HTML report\n" if ($verbose); + + ### General Alignment Stats + $doc =~ s/\{\{unique_seqs\}\}/$unique/g; + $doc =~ s/\{\{unique_seqs_text\}\}/$unique_text/g; + + $doc =~ s/\{\{no_alignments\}\}/$no_aln/g; + $doc =~ s/\{\{no_alignments_text\}\}/$no_aln_text/g; + + $doc =~ s/\{\{multiple_alignments\}\}/$multiple/g; + $doc =~ s/\{\{multiple_alignments_text\}\}/$multiple_text/g; + + $doc =~ s/\{\{no_genomic\}\}/$no_genomic/g; + + $doc =~ s/\{\{total_sequences_alignments\}\}/$total_seqs/g; + $doc =~ s/\{\{sequences_analysed_in_total\}\}/$total_seq_text/g; + + $doc =~ s/\{\{filename\}\}/$input_filename/g; + $doc =~ s/\{\{bismark_version\}\}/$bismark_version/g; + + ### Strand Origin + $doc =~ s/\{\{number_OT\}\}/$number_OT/g; + $doc =~ s/\{\{number_CTOT\}\}/$number_CTOT/g; + $doc =~ s/\{\{number_CTOB\}\}/$number_CTOB/g; + $doc =~ s/\{\{number_OB\}\}/$number_OB/g; + + ### Context Methylation + $doc =~ s/\{\{total_C_count\}\}/$total_C_count/g; + + unless (defined $perc_CpG){ + $perc_CpG = 'N/A'; + } + unless (defined $perc_CHG){ + $perc_CHG = 'N/A'; + } + unless (defined $perc_CHH){ + $perc_CHH = 'N/A'; + } + unless (defined $perc_unknown){ + $perc_unknown = 'N/A'; + } + + ### Unknown sequence context, just for Bowtie 2 alignments + my $meth_unknown_inject; + my $unmeth_unknown_inject; + my $perc_unknown_inject; + + if (defined $meth_unknown){ # if one Unknown context file is present, so should the others + $meth_unknown_inject = " + Methylated C's in Unknown context + $meth_unknown + "; + $unmeth_unknown_inject = " + Unmethylated C's in Unknown context + $unmeth_unknown + "; + $perc_unknown_inject = " + Methylated C's in Unknown context + $perc_unknown% + "; + } + else{ + $meth_unknown_inject = $unmeth_unknown_inject = $perc_unknown_inject = ''; + } + + ### injecting this into the table + $doc =~ s/\{\{meth_unknown\}\}/$meth_unknown_inject/g; + $doc =~ s/\{\{unmeth_unknown\}\}/$unmeth_unknown_inject/g; + $doc =~ s/\{\{perc_unknown\}\}/$perc_unknown_inject/g; + + + $doc =~ s/\{\{meth_CpG\}\}/$meth_CpG/g; + $doc =~ s/\{\{meth_CHG\}\}/$meth_CHG/g; + $doc =~ s/\{\{meth_CHH\}\}/$meth_CHH/g; + + $doc =~ s/\{\{unmeth_CpG\}\}/$unmeth_CpG/g; + $doc =~ s/\{\{unmeth_CHG\}\}/$unmeth_CHG/g; + $doc =~ s/\{\{unmeth_CHH\}\}/$unmeth_CHH/g; + + $doc =~ s/\{\{perc_CpG\}\}/$perc_CpG/g; + $doc =~ s/\{\{perc_CHG\}\}/$perc_CHG/g; + $doc =~ s/\{\{perc_CHH\}\}/$perc_CHH/g; + + my ($perc_CpG_graph, $perc_CHG_graph,$perc_CHH_graph,$perc_unknown_graph); + + if ($perc_CpG eq 'N/A'){ + $perc_CpG_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CpG_graph = $perc_CpG; + } + + if ($perc_CHG eq 'N/A'){ + $perc_CHG_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CHG_graph = $perc_CHG; + } + + if ($perc_CHH eq 'N/A'){ + $perc_CHH_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CHH_graph = $perc_CHH; + } + + if ($perc_unknown eq 'N/A'){ + $perc_unknown_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_unknown_graph = $perc_unknown; + } + + $doc =~ s/\{\{perc_CpG_graph\}\}/$perc_CpG_graph/g; + $doc =~ s/\{\{perc_CHG_graph\}\}/$perc_CHG_graph/g; + $doc =~ s/\{\{perc_CHH_graph\}\}/$perc_CHH_graph/g; + + } + else{ + warn "Am I missing something?\n\n"; + } + + warn "Complete\n\n"; + return $doc; +} + + +sub read_deduplication_report{ + + my ($dedup_report,$doc) = @_; + + warn "Processing deduplication report $dedup_report ...\n"; + open (DEDUP,$dedup_report) or die "Couldn't read from file $dedup_report: $!\n\n"; + + my $total_seqs; + my $dups; + my $diff_pos; + my $leftover; + + while (){ + chomp; + if ($_ =~ /^Total number of alignments/){ + (undef,$total_seqs) = split /\t/; + warn "Total number of seqs >> $total_seqs <<\n" if ($verbose); + } + elsif($_ =~ /^Total number duplicated/){ + (undef,$dups) = split /\t/; + $dups =~ s/\s.*//; # just need the number, not the percentage + warn "Duplicated >> $dups <<\n" if ($verbose); + } + elsif($_ =~ /^Duplicated alignments were found at/){ + (undef,$diff_pos) = split /\t/; + $diff_pos =~ s/\s.*//; # just need the number + warn "Different positions >> $diff_pos <<\n" if ($verbose); + } + elsif($_ =~ /^Total count of deduplicated leftover sequences: (\d+)/){ + $leftover = $1; + warn "Leftover seqs >> $leftover <<\n" if ($verbose); + } + } + + unless (defined $leftover){ + if (defined $dups and defined $total_seqs){ + $leftover = $total_seqs - $dups; + } + } + + # Checking if we got all we need + if (defined $dups and defined $total_seqs and defined $diff_pos and defined $leftover){ + # warn "Got all I need!\n\n"; + $doc =~ s/\{\{seqs_total_duplicates\}\}/$total_seqs/g; + $doc =~ s/\{\{unique_alignments_duplicates\}\}/$leftover/g; + $doc =~ s/\{\{duplicate_alignments_duplicates\}\}/$dups/g; + $doc =~ s/\{\{different_positions_duplicates\}\}/$diff_pos/g; + } + else{ + warn "Something went wrong... Use --verbose to get a clue...\n"; + # skipping this plot entirely if values could not be extracted + return $doc; + } + warn "Complete\n\n"; + return $doc; +} + + +sub read_nucleotide_coverage_report{ + + my ($nuc_report,$doc) = @_; + + warn "Processing nucleotide coverage report '$nuc_report' ...\n"; + open (NUC,$nuc_report) or die "Couldn't read from file $nuc_report: $!\n\n"; + + my %nucs; # storing nucleotides and frequencies + my $linecount = 0; + + while (){ + chomp; + $_ =~ s/\r//; # removing carriage returns + # warn "$_\n"; sleep(1); + my ($element,$count_obs,$observed,$count_exp,$expected,$coverage) = (split /\t/); + # warn "$element , $count_obs , $observed , $count_exp , $expected, $coverage\n"; sleep(1); + if ($linecount == 0){ # verifying that the data appears to be a Bismark nucleotide coverage report + if ($observed eq 'percent sample'){ + # warn "Fine, found '$observed'\n"; + } + else{ + die "Expected to find 'percent sample' as entry in line 1, column 3 but found '$observed'. This doesn't look like a Bismark nucleotide coverage report. Please respecify!\n"; + } + + if ($expected eq 'percent genomic'){ + # warn "Fine, found '$expected'\n"; + } + else{ + die "Expected to find 'percent genomic' as entry in line 1, column 5 but found '$expected'. This doesn't look like a Bismark nucleotide coverage report. Please respecify!\n"; + } + } + else{ + $nucs{$element}->{obs}->{percent} = $observed; + $nucs{$element}->{exp}->{percent} = $expected; + $nucs{$element}->{obs}->{counts} = $count_obs; + $nucs{$element}->{exp}->{counts} = $count_exp; + $nucs{$element}->{obs}->{coverage} = $coverage; # coverage of that nucleotide in the sample + warn "Element '$element' observed: $observed\n" if $verbose; + warn "Element '$element' expected: $expected\n" if $verbose; + } + + ++$linecount; + + } + + # Checking if we got all we need + my $looksOK = 1; + foreach my $key (keys %nucs){ + unless ( (defined $nucs{$key}->{obs}) and (defined $nucs{$key}->{exp})){ + $looksOK = 0; + } + } + + if ($looksOK){ + warn "Got all necessary information, editing HTML report ...\n" if $verbose; + my $minmax = 0; + foreach my $key (sort {$a cmp $b} keys %nucs){ + my $nuc_obs = $nucs{$key}->{obs}->{percent}; + my $nuc_exp = $nucs{$key}->{exp}->{percent}; + my $counts_obs = $nucs{$key}->{obs}->{counts}; + my $counts_exp = $nucs{$key}->{exp}->{counts}; + my $cov = $nucs{$key}->{obs}->{coverage}; + + # calculating log2 observed/expected + my $ratio = $nuc_obs/$nuc_exp; + # my $logratio = sprintf ("%.2f",log($ratio)/log(2)); + # if (abs($logratio) > $minmax){ + # $minmax = abs($logratio); + # } + warn "$key\tnuc_${key}_obs\t$nuc_obs\tnuc_${key}_exp\t$nuc_exp\tratio: $ratio\n" if $verbose; + + $doc =~ s/\{\{nuc_${key}_p_obs\}\}/$nuc_obs/g; + $doc =~ s/\{\{nuc_${key}_p_exp\}\}/$nuc_exp/g; + $doc =~ s/\{\{nuc_${key}_counts_obs\}\}/$counts_obs/g; + $doc =~ s/\{\{nuc_${key}_counts_exp\}\}/$counts_exp/g; + $doc =~ s/\{\{nuc_${key}_coverage\}\}/$cov/g; + } + # warn "Minimum/maxium ratio was: $minmax\n" if $verbose; + # $doc =~ s/\{\{nuc_minmax\}\}/$minmax/g; + } + else{ + warn "Something went wrong, skipping this plot entirely... Use --verbose to get a clue...\n"; + # skipping this plot entirely if values could not be extracted + return $doc; + } + + warn "Complete\n\n"; + return $doc; +} + + +sub read_splitting_report{ + + my ($splitting_report,$doc) = @_; + + warn "Processing splitting report $splitting_report ...\n"; + open (SPLIT,$splitting_report) or die "Couldn't read from file $splitting_report: $!\n\n"; + + my $total_seqs; + + my $total_C_count; + my ($meth_CpG,$meth_CHG,$meth_CHH,$meth_unknown); + my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH,$unmeth_unknown); + my ($perc_CpG,$perc_CHG,$perc_CHH,$perc_unknown); + + while (){ + chomp; + + ### Context Methylation + if($_ =~ /^Total number of C/ ){ + (undef,$total_C_count) = split /\t/; + print "total calls >> $total_C_count <<\n" if ($verbose); + } + + elsif($_ =~ /^Total methylated C\'s in CpG context:/ ){ + (undef,$meth_CpG) = split /\t/; + print "meth CpG >> $meth_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in CHG context:/ ){ + (undef,$meth_CHG) = split /\t/; + print "meth CHG>> $meth_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in CHH context:/ ){ + (undef,$meth_CHH) = split /\t/; + print "meth CHH >> $meth_CHH <<\n" if ($verbose); + } + elsif($_ =~ /^Total methylated C\'s in Unknown context:/ ){ + (undef,$meth_unknown) = split /\t/; + print "meth Unknown >> $meth_unknown <<\n" if ($verbose); + } + + elsif($_ =~ /^Total C to T conversions in CpG context:/ ){ + (undef,$unmeth_CpG) = split /\t/; + print "unmeth CpG >> $unmeth_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^Total C to T conversions in CHG context:/ ){ + (undef,$unmeth_CHG) = split /\t/; + print "unmeth CHG >> $unmeth_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^Total C to T conversions in CHH context:/ ){ + (undef,$unmeth_CHH) = split /\t/; + print "unmeth CHH >> $unmeth_CHH <<\n" if ($verbose); + } + elsif($_ =~ /^Total C to T conversions in Unknown context:/ ){ + (undef,$unmeth_unknown) = split /\t/; + print "unmeth Unknown >> $unmeth_unknown <<\n" if ($verbose); + } + + elsif($_ =~ /^C methylated in CpG context:/ ){ + (undef,$perc_CpG) = split /\t/; + $perc_CpG =~ s/%//; + print "percentage CpG >> $perc_CpG <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in CHG context:/ ){ + (undef,$perc_CHG) = split /\t/; + $perc_CHG =~ s/%//; + print "percentage CHG >> $perc_CHG <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in CHH context:/ ){ + (undef,$perc_CHH) = split /\t/; + $perc_CHH =~ s/%//; + print "percentage CHH >> $perc_CHH <<\n" if ($verbose); + } + elsif($_ =~ /^C methylated in Unknown context:/ ){ + (undef,$perc_unknown) = split /\t/; + $perc_unknown =~ s/%//; + print "percentage unknown >> $perc_unknown <<\n" if ($verbose); + } + } + + if (defined $meth_CpG and defined $meth_CHG and defined $meth_CHH and defined $unmeth_CpG and defined $unmeth_CHG and defined $unmeth_CHH){ + warn "Got all necessary information, editing HTML report ...\n" if ($verbose); + + ### Context Methylation + $doc =~ s/\{\{total_C_count_splitting\}\}/$total_C_count/g; + + $doc =~ s/\{\{meth_CpG_splitting\}\}/$meth_CpG/g; + $doc =~ s/\{\{meth_CHG_splitting\}\}/$meth_CHG/g; + $doc =~ s/\{\{meth_CHH_splitting\}\}/$meth_CHH/g; + + $doc =~ s/\{\{unmeth_CpG_splitting\}\}/$unmeth_CpG/g; + $doc =~ s/\{\{unmeth_CHG_splitting\}\}/$unmeth_CHG/g; + $doc =~ s/\{\{unmeth_CHH_splitting\}\}/$unmeth_CHH/g; + + unless (defined $perc_CpG){ + $perc_CpG = 'N/A'; + } + unless (defined $perc_CHG){ + $perc_CHG = 'N/A'; + } + unless (defined $perc_CHH){ + $perc_CHH = 'N/A'; + } + unless (defined $perc_unknown){ + $perc_unknown = 'N/A'; + } + + ### Unknown sequence context, just for Bowtie 2 alignments + my $meth_unknown_inject; + my $unmeth_unknown_inject; + my $perc_unknown_inject; + + if (defined $meth_unknown){ # if one Unknown context file is present, so should the others + $meth_unknown_inject = " + Methylated C's in Unknown context + $meth_unknown + "; + $unmeth_unknown_inject = " + Unmethylated C's in Unknown context + $unmeth_unknown + "; + $perc_unknown_inject = " + Methylated C's in Unknown context + $perc_unknown% + "; + } + else{ + $meth_unknown_inject = $unmeth_unknown_inject = $perc_unknown_inject = ''; + } + + ### injecting this into the table + $doc =~ s/\{\{meth_unknown_splitting\}\}/$meth_unknown_inject/g; + $doc =~ s/\{\{unmeth_unknown_splitting\}\}/$unmeth_unknown_inject/g; + $doc =~ s/\{\{perc_unknown_splitting\}\}/$perc_unknown_inject/g; + + # for the graph we need to take care that there are no N/A values in the percentage fields + my ($perc_CpG_graph, $perc_CHG_graph,$perc_CHH_graph,$perc_unknown_graph); + + if ($perc_CpG eq 'N/A'){ + $perc_CpG_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CpG_graph = $perc_CpG; + } + + if ($perc_CHG eq 'N/A'){ + $perc_CHG_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CHG_graph = $perc_CHG; + } + + if ($perc_CHH eq 'N/A'){ + $perc_CHH_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_CHH_graph = $perc_CHH; + } + + if ($perc_unknown eq 'N/A'){ + $perc_unknown_graph = 0; # values of 0 won't show in the graph and won't produce errors + } + else{ + $perc_unknown_graph = $perc_unknown; + } + + $doc =~ s/\{\{perc_CpG_graph_splitting\}\}/$perc_CpG_graph/g; + $doc =~ s/\{\{perc_CHG_graph_splitting\}\}/$perc_CHG_graph/g; + $doc =~ s/\{\{perc_CHH_graph_splitting\}\}/$perc_CHH_graph/g; + + $doc =~ s/\{\{perc_CpG_splitting\}\}/$perc_CpG/g; + $doc =~ s/\{\{perc_CHG_splitting\}\}/$perc_CHG/g; + $doc =~ s/\{\{perc_CHH_splitting\}\}/$perc_CHH/g; + } + else{ + warn "Am I missing something? Try using --verbose to get a clue...\n\n"; + } + warn "Complete\n\n"; + + return $doc; + +} + + +sub read_mbias_report{ + + my ($mbias_report,$doc) = @_; + + warn "Processing M-bias report $mbias_report ...\n"; + open (MBIAS,$mbias_report) or die "Couldn't read from file $mbias_report: $!\n\n"; + + my %mbias_1; + my %mbias_2; + + my $context; + my $read_identity; + my $state = 'single'; # setting this to 'single' if there is no read 2 + + while (){ + chomp; + if ($_ =~ /^(C.{2}) context/){ + $context = $1; + + if ($_ =~ /R2/){ + $read_identity = 2; + $state = 'paired'; + } + else{ + $read_identity = 1; + } + + # warn "new context is: $context\n"; + # warn "Read identity is: Read $read_identity\n"; + } + if ($_ =~ /^\d/){ + my ($pos,$meth,$unmeth,$perc,$coverage) = (split /\t/); + if ($read_identity == 1){ + push @{$mbias_1{$context}->{coverage}}, "[$pos, $coverage]"; + push @{$mbias_1{$context}->{perc}}, "[$pos, $perc]"; + } + elsif ($read_identity == 2){ + push @{$mbias_2{$context}->{coverage}}, "[$pos, $coverage]"; + push @{$mbias_2{$context}->{perc}}, "[$pos, $perc]"; + } + else{ + warn "read identity was unknown : '$read_identity'\n\n"; + } + + # print join (" ",$pos,$meth,$unmeth,$perc,$coverage)."\n"; + } + } + + # Read 1 M-bias + my $r1_CpG_coverage = join (',',@{$mbias_1{'CpG'}->{coverage}}); + my $r1_CpG_perc = join (',',@{$mbias_1{'CpG'}->{perc}}); + + my $r1_CHG_coverage = join (',',@{$mbias_1{'CHG'}->{coverage}}); + my $r1_CHG_perc = join (',',@{$mbias_1{'CHG'}->{perc}}); + + my $r1_CHH_coverage = join (',',@{$mbias_1{'CHH'}->{coverage}}); + my $r1_CHH_perc = join (',',@{$mbias_1{'CHH'}->{perc}}); + + $doc =~ s/\{\{CpG_total_calls_R1\}\}/$r1_CpG_coverage/g; + $doc =~ s/\{\{CHG_total_calls_R1\}\}/$r1_CHG_coverage/g; + $doc =~ s/\{\{CHH_total_calls_R1\}\}/$r1_CHH_coverage/g; + + $doc =~ s/\{\{CpG_methylation_R1\}\}/$r1_CpG_perc/g; + $doc =~ s/\{\{CHG_methylation_R1\}\}/$r1_CHG_perc/g; + $doc =~ s/\{\{CHH_methylation_R1\}\}/$r1_CHH_perc/g; + + # Read 2 M-bias + if (%mbias_2){ + my $r2_CpG_coverage = join (',',@{$mbias_2{'CpG'}->{coverage}}); + my $r2_CpG_perc = join (',',@{$mbias_2{'CpG'}->{perc}}); + + my $r2_CHG_coverage = join (',',@{$mbias_2{'CHG'}->{coverage}}); + my $r2_CHG_perc = join (',',@{$mbias_2{'CHG'}->{perc}}); + + my $r2_CHH_coverage = join (',',@{$mbias_2{'CHH'}->{coverage}}); + my $r2_CHH_perc = join (',',@{$mbias_2{'CHH'}->{perc}}); + + $doc =~ s/\{\{CpG_total_calls_R2\}\}/$r2_CpG_coverage/g; + $doc =~ s/\{\{CHG_total_calls_R2\}\}/$r2_CHG_coverage/g; + $doc =~ s/\{\{CHH_total_calls_R2\}\}/$r2_CHH_coverage/g; + + $doc =~ s/\{\{CpG_methylation_R2\}\}/$r2_CpG_perc/g; + $doc =~ s/\{\{CHG_methylation_R2\}\}/$r2_CHG_perc/g; + $doc =~ s/\{\{CHH_methylation_R2\}\}/$r2_CHH_perc/g; + } + warn "Complete\n\n"; + + return ($state,$doc); + +} + + +sub read_report_template{ + my $doc; + warn "Attempting to open file from: $Bin/bismark_sitrep.tpl\n\n" if ($verbose); + open (DOC,"$Bin/bismark_sitrep.tpl") or die $!; + while(){ + chomp; + $_ =~ s/\r//g; + $doc .= $_."\n"; + } + + close DOC or die $!; + return $doc; +} + + + +sub process_commandline{ + my $help; + my $output_dir; + my $manual_output_file; + my $alignment_report; + my $dedup_report; + my $splitting_report; + my $mbias_report; + my $nucleotide_coverage_report; # stores nucleotide coverage statistics + my $verbose; + + my $version; + + my $command_line = GetOptions ('help|man' => \$help, + 'dir=s' => \$output_dir, + 'o|output=s' => \$manual_output_file, + 'alignment_report=s' => \$alignment_report, + 'dedup_report=s' => \$dedup_report, + 'splitting_report=s' => \$splitting_report, + 'mbias_report=s' => \$mbias_report, + 'nucleotide_report=s' => \$nucleotide_coverage_report, + 'version' => \$version, + 'verbose' => \$verbose, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($version){ + print << "VERSION"; + + + Bismark HTML Report Module + + bismark2report version: $bismark2report_version + Copyright 2010-16 Felix Krueger + Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + ### OUTPUT DIR PATH + if (defined $output_dir){ + unless ($output_dir eq ''){ # if the output dir has been passed on by the methylation extractor and is an empty string we don't want to change it + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + } + else{ + $output_dir = ''; + } + + + ## First we are looking for alignment reports, and then look whether there are any optional plots with the same base name + + if ($alignment_report){ + ### we only process the one alignment report (and possibly the other ones as well) that was specified + push @alignment_reports, $alignment_report; + } + else{ ### no alignment report specified, looking in the current directory for files ending in *E_report.txt (SE or PE) + + ### looking in the current directory for report files. Less than 1 report file is not allowed + @alignment_reports = <*E_report.txt>; + + if (scalar @alignment_reports == 0){ + warn "Found no potential alignment reports in the current directory. Please specify a single Bismark alignment report file using the option '--alignment_report FILE'\n\n"; + print_helpfile(); + exit; + } + else{ + # there are Bismark alignment report(s) in the directory + warn "Found ",scalar @alignment_reports," alignment reports in current directory. Now trying to figure out whether there are corresponding optional reports\n"; + } + } + + ### Ensuring that there isn't more than 1 file in @alignment_reports if someone manually specified an output file. + if (scalar @alignment_reports > 1){ + if (defined $manual_output_file){ + die "You cannot run bismark2report on more than 1 file while specifying a single output file. Either lose the option -o to derive the output filenames automatically, or specify a single Bismark alignment report file using the option '--alignment_report FILE'\n\n"; + } + } + + foreach my $aln (@alignment_reports){ + + # warn "Figuring things out for:\n$aln\n"; + $aln =~ /^(.+)_(P|S)E_report.txt$/; + my $basename = $1; + # warn "using this basename:\n$basename\n\n"; + + ### DEDUPLICATION REPORT (optional) + if ($dedup_report){ + warn "User specifified dedup report: $dedup_report\n";sleep(3); + if (lc$dedup_report eq 'none'){ + push @dedup_reports, ''; # user may wish to skip this step by specifying 'none' + } + else{ + push @dedup_reports, $dedup_report; + } + } + else{ + ### looking in the current directory for a report file with the same base name + my @dedup_report_files = <$basename*deduplication_report.txt>; + # warn "Number of deduplication reports found in current directory for basename $1: ", scalar @dedup_report_files , "\n"; + + if (scalar @dedup_report_files > 1){ + die "Found ", scalar @dedup_report_files ," potential deduplication reports with the same basename ($basename) in the current directory. Please specify a single report using the option '--dedup_report FILE' or otherwise provide filenames that are easier to figure out...\n\n"; + } + elsif (scalar @dedup_report_files == 0){ + push @dedup_reports, ''; # no corresponding deduplication report found + } + else{ + # there is only a single deduplication report in the current directory, using this one + $dedup_report = shift @dedup_report_files; + push @dedup_reports, $dedup_report; + # warn "Going to use this dedup report: $dedup_report\n\n"; + } + } + + ### NUCLEOTIDE COVERAGE REPORT (optional) + if (defined $nucleotide_coverage_report){ + # warn "User specified nucleotide coverage report: $nucleotide_coverage_report\n";sleep(1); + if (lc$nucleotide_coverage_report eq 'none'){ + push @nuc_reports, ''; # user may wish to skip this step by specifying 'none' + } + else{ + push @nuc_reports, $nucleotide_coverage_report; + } + } + else{ + ### looking in the current directory for a report file with the same base name + my @nucleotide_coverage_report_files = <$basename*nucleotide_stats.txt>; + # warn "Number of nucleotide coverage statistic reports found in current directory for basename $1: ", scalar @nucleotide_coverage_report_files , "\n"; + + if (scalar @nucleotide_coverage_report_files > 1){ + die "Found ", scalar @nucleotide_coverage_report_files ," potential nucleotide coverage reports with the same basename ($basename) in the current directory. Please specify a single report using the option '--nucleotide_report FILE' or otherwise provide filenames that are easier to figure out...\n\n"; + } + elsif (scalar @nucleotide_coverage_report_files == 0){ + # warn "Found no corresponding nucleotide coverage file\n"; + push @nuc_reports, ''; # no corresponding nucleotide coverge file report found + } + else{ + # there is only a single nucleotide coverage report in the current directory, using this one + $nucleotide_coverage_report = shift @nucleotide_coverage_report_files; + push @nuc_reports, $nucleotide_coverage_report; + # warn "Going to use this nucleotide coverage report: $nucleotide_coverage_report\n\n"; sleep(3); + } + } + + + ### METHYLATION EXTRACTOR SPLITTING REPORT + if ($splitting_report){ + if (lc$splitting_report eq 'none'){ + push @splitting_reports, ''; # user may wish to skip this step by specifying 'none' + } + else{ + push @splitting_reports, $splitting_report; + } + } + else{ + ### looking in the current directory for a report file with the same basename + my @splitting_report_files = <$basename*splitting_report.txt>; + # warn "Number of splitting reports found in current directory: ", scalar @splitting_report_files , "\n"; + + if (scalar @splitting_report_files > 1){ + die "Found ", scalar @splitting_report_files ," potential methylation extractor splitting reports with the same basename ($basename) in the current directory. Please specify a single report using the option '--splitting_report FILE' or otherwise provide filenames that are easier to figure out...\n\n"; + } + elsif (scalar @splitting_report_files == 0){ + push @splitting_reports, ''; # no corresponding methylation extractor report found + } + else{ + # there is only a single splitting report in the current directory, using this one + $splitting_report = shift @splitting_report_files; + push @splitting_reports, $splitting_report; + } + } + + ### M-BIAS REPORT + if ($mbias_report){ + if (lc$mbias_report eq 'none'){ + $mbias_report = ''; # user may wish to skip this step by specifying 'none' + push @mbias_reports, $mbias_report; + } + else{ + push @mbias_reports, $mbias_report; + } + } + else{ + ### looking in the current directory for a single report file. More (or less) than 1 report file is not allowed + my @mbias_report_files = <$basename*M-bias.txt>; + + # warn "Number of M-bias reports found in current directory: ", scalar @mbias_report_files , "\n"; + + if (scalar @mbias_report_files > 1){ + die "Found ", scalar @mbias_report_files ," potential M-bias reports with the same basename ($basename) in the current directory. Please specify a single report using the option '--mbias_report FILE'or otherwise provide filenames that are easier to figure out automatically ...\n\n"; + } + elsif (scalar @mbias_report_files == 0){ + push @mbias_reports, ''; + } + else{ + # there is only a single M-bias report in the current directory, using this one + $mbias_report = shift @mbias_report_files; + push @mbias_reports, $mbias_report; + } + } + $dedup_report = $splitting_report = $mbias_report = $nucleotide_coverage_report = undef; + } + + return ($output_dir,$verbose,$manual_output_file); + +} + +sub print_helpfile{ + print < Name of the output file (optional). If not specified explicitly, the output filename will be derived + from the Bismark alignment report file. Specifying an output filename only works if the HTML report is + to be generated for a single Bismark alignment report (and potentially additional reports). + +--dir Output directory. Output is written to the current directory if not specified explicitly. + + +--alignment_report FILE If not specified explicitly, bismark2report attempts to find Bismark report file(s) in the current + directory and produces a separate HTML report for each mapping report file. Based on the basename of + the Bismark mapping report, bismark2report will also attempt to find the other Bismark reports (see below) + for inclusion into the HTML report. Specifying a Bismark alignment report file is mandatory. + +--dedup_report FILE If not specified explicitly, bismark2report attempts to find a deduplication report file with the same + basename as the Bismark mapping report (generated by deduplicate_bismark) in the + current working directory. Including a deduplication report is optional, and using the FILE 'none' + will skip this step entirely. + +--splitting_report FILE If not specified explicitly, bismark2report attempts to find a splitting report file with the same + basename as the Bismark mapping report (generated by the Bismark methylation extractor) in the current + working directory. Including a splitting report is optional, and using the FILE 'none' will skip this + step entirely. + +--mbias_report FILE If not specified explicitly, bismark2report attempts to find a single M-bias report file with the same + basename as the Bismark mapping report (generated by the Bismark methylation extractor) in the current + working directory. Including an M-Bias report is optional, and using the FILE 'none' will skip this step + entirely. + +--nucleotide_report FILE If not specified explicitly, bismark2report attempts to find a single nucleotide coverage report file + with the same basename as the Bismark mapping report (generated by Bismark with the option + '--nucleotide_coverage') in the current working directory. Including a nucleotide coverage statistics + report is optional, and using the FILE 'none' will skip this report entirely. + + Script last modified: 13 May 2016 + +EOF + ; + exit 1; +} + diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_pretty_report/bismark2report_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_pretty_report/bismark2report_wrapper.py Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,122 @@ +#!/usr/bin/python + +import argparse +import os +import shutil +import subprocess +import sys +import tempfile +import logging + +def cleanup_before_exit(tmp_dir): + if tmp_dir and os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +def get_arg(): + parser = argparse.ArgumentParser() + parser.add_argument('--tool_dir', dest='tool_dir', action='store', nargs=1, metavar='tool_dir', type=str) + parser.add_argument('--alignment_report', dest='alignment_report', action='store', nargs=1, metavar='alignment_report', type=str) + parser.add_argument('--dedup_report', dest='dedup_report', action='store', nargs=1, metavar='dedup_report', type=str) + parser.add_argument('--splitting_report', dest='splitting_report', action='store', nargs=1, metavar='splitting_report', type=str) + parser.add_argument('--mbias_report', dest='mbias_report', action='store', nargs=1, metavar='mbias_report', type=str) + parser.add_argument('--nucleotide_report', dest='nucleotide_report', action='store', nargs=1, metavar='nucleotide_report', type=str) + parser.add_argument('--output_html_report', dest='output_html_report', action='store', nargs=1, metavar='output_html_report', type=str) + parser.add_argument('--output_html_report_link', dest='output_html_report_link', action='store', nargs=1, metavar='output_html_report_link', type=str) + parser.add_argument('--log_report', dest='log_report', action='store', nargs=1, metavar='log_report', type=str) + parser.add_argument('--output_dir', dest='job_dir', action='store', nargs=1, metavar='job_dir', type=str) + args = parser.parse_args() + return args + +def create_and_write_html_link(job_dir, output_html_report_link, tmp_dir): + """ + Web browsers don't allow to open a link pointing to the absolute path of a local html file FROM a website page; + The only way to make such link functional is to integrate the local file inside the web structure of the site. + Galaxy has been designed such that the child_dir of the output_dir is considered as the root + of the html base tag (i.e for the current job running. + The function proceeds the following steps: + #1. Extracts the galaxy dir where the output files are stored + #2. Creating a child dir in this output_dir is needed because it is considered as the root of the html base tag + # We can extract the exact name of this child dir from the jobs_directory name + #3. Moves the html file in this child_dir + """ + output_path_list = output_html_report_link.split('/') + output_path = '/'.join(output_path_list[0:-1]) + html_root = job_dir.split('/')[-1] + final_dir = os.path.join(output_path, html_root) + os.makedirs(final_dir) + shutil.move(os.path.join(tmp_dir, 'html_report'), os.path.join(final_dir, 'html_report.html')) + + html_report = open(output_html_report_link, 'wb') + html_report.write('\n') + html_report.write('\n') + html_report.write('\t\n') + html_report.write('\t\t\n') + html_report.write('\t\tLink to Bismark Pretty Report Page\n') + html_report.write('') + html_report.close() + +def __main__(): + args = get_arg() + + tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') + + if args.log_report: + logging.basicConfig(level=logging.INFO, filename=args.log_report[0], filemode="a+", format='%(message)s') + else: + logging.basicConfig(level=logging.INFO, filename=os.path.join(tmp_dir, 'log_report.txt'), filemode="a+", format='%(message)s') + + alignment_option = '--alignment_report' + alignment_report = args.alignment_report[0] + if args.dedup_report: + dedup_option = '--dedup_report' + dedup_report = args.dedup_report[0] + else: + dedup_option = '' + dedup_report = '' + if args.splitting_report: + splitting_option = '--splitting_report' + splitting_report = args.splitting_report[0] + else: + splitting_option = '' + splitting_report = '' + if args.mbias_report: + mbias_option = '--mbias_report' + mbias_report = args.mbias_report[0] + else: + mbias_option = '' + mbias_report = '' + if args.nucleotide_report: + nucleotide_option = '--nucleotide_report' + nucleotide_report = args.nucleotide_report[0] + else: + nucleotide_option = '' + nucleotide_report = '' + + proc = subprocess.Popen(['perl', os.path.join(args.tool_dir[0], 'bismark2report'), alignment_option, alignment_report, dedup_option, dedup_report,\ + splitting_option, splitting_report, mbias_option, mbias_report, nucleotide_option, nucleotide_report,\ + '--dir', tmp_dir, '--output', 'html_report'],\ + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc_out, proc_err = proc.communicate() + + cmd = 'perl %s %s %s %s %s %s %s %s %s %s %s --output html_report --dir %s'\ + % (os.path.join(args.tool_dir[0], 'bismark2report'), alignment_option, alignment_report, dedup_option, dedup_report,\ + splitting_option, splitting_report, mbias_option, mbias_report, nucleotide_option, nucleotide_report, tmp_dir) + + logging.info('COMMAND LINE:\n\n%s' % cmd) + logging.info("__________________________________________________________________\n") + logging.info("BISMARK PRETTY REPORT STDOUT:\n\n%s" % proc_out) + if proc_err: + logging.critical("__________________________________________________________________\n") + logging.critical("BISMARK PRETTY REPORT ERROR:\n\n%s" % proc_err) + sys.exit("Bismark pretty report crashed with the folowing error message:\n%s" % proc_err) + + if args.output_html_report: + shutil.copy(os.path.join(tmp_dir, 'html_report'), args.output_html_report[0]) + + #This function writes a link towards the Bismark html page inside an html file. + #This is needed because the direct visualization of the Bismark html report via Galaxy is ugly + create_and_write_html_link(args.job_dir[0], args.output_html_report_link[0], tmp_dir) + + cleanup_before_exit(tmp_dir) + +if __name__=="__main__": __main__() \ No newline at end of file diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_pretty_report/bismark2report_wrapper.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_pretty_report/bismark2report_wrapper.xml Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,75 @@ + + + Generates a graphical HTML report page from report outputs of Bismark + + + + samtools + bowtie2 + + + + + + + + + + + + + + + +
+ + + + +
+
+ + + + + + + + + + + + 10.1093/bioinformatics/btr167 + +
diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_pretty_report/bismark_sitrep.tpl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bismark_pretty_report/bismark_sitrep.tpl Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,807 @@ + + + + + Bismark Processing Report - {{filename}} + + + + + + +
+
+ Bismark +

Bismark Processing Report

+
+

{{filename}}

+

Data processed at {{time}} on {{date}}

+
+
+
+ +

Alignment

+ + + + + + + + + + + + + + + + + + + + + + + + + +
{{sequences_analysed_in_total}}{{total_sequences_alignments}}
{{unique_seqs_text}}{{unique_seqs}}
{{no_alignments_text}}{{no_alignments}}
{{multiple_alignments_text}}{{multiple_alignments}}
Genomic sequence context not extractable (edges of chromosomes){{no_genomic}}
+
+ +
+ + +

Cytosine Methylation

+ + + + + + + + + + + + + + + + + + + + + {{meth_unknown}} + + + + + + + + + + + + + + + {{unmeth_unknown}} + + + + + + + + + + + + + + + {{perc_unknown}} + +
Total C's analysed{{total_C_count}}
Methylated C's in CpG context{{meth_CpG}}
Methylated C's in CHG context{{meth_CHG}}
Methylated C's in CHH context{{meth_CHH}}
Unmethylated C's in CpG context{{unmeth_CpG}}
Unmethylated C's in CHG context{{unmeth_CHG}}
Unmethylated C's in CHH context{{unmeth_CHH}}
Percentage methylation (CpG context){{perc_CpG}}%
Percentage methylation (CHG context){{perc_CHG}}%
Percentage methylation (CHH context){{perc_CHH}}%
+
+ +
+ +

Alignment to Individual Bisulfite Strands

+ + + + + + + + + + + + + + + + + + + + + + + +
OT{{number_OT}}original top strand
CTOT{{number_CTOT}}complementary to original top strand
CTOB{{number_CTOB}}complementary to original bottom strand
OB{{number_OB}}original bottom strand
+
+ +
+ + {{start_deletion_duplication}} + +

Deduplication

+ + + + + + + + + + + + + + + + + + + + +
Alignments analysed{{seqs_total_duplicates}}
Unique alignments{{unique_alignments_duplicates}}
Duplicates removed{{duplicate_alignments_duplicates}}
Duplicated alignments were found at {{different_positions_duplicates}} different positions
+
+
+ + {{end_deletion_duplication}} + + {{start_deletion_splitting}} + +

Cytosine Methylation after Extraction

+ + + + + + + + + + + + + + + + + + + + + {{meth_unknown_splitting}} + + + + + + + + + + + + + + + {{unmeth_unknown_splitting}} + + + + + + + + + + + + + + + {{perc_unknown_splitting}} + +
Total C's analysed{{total_C_count_splitting}}
Methylated C's in CpG context{{meth_CpG_splitting}}
Methylated C's in CHG context{{meth_CHG_splitting}}
Methylated C's in CHH context{{meth_CHH_splitting}}
Unmethylated C's in CpG context{{unmeth_CpG_splitting}}
Unmethylated C's in CHG context{{unmeth_CHG_splitting}}
Unmethylated C's in CHH context{{unmeth_CHH_splitting}}
Percentage methylation (CpG context){{perc_CpG_splitting}}%
Percentage methylation (CHG context){{perc_CHG_splitting}}%
Percentage methylation (CHH context){{perc_CHH_splitting}}%
+
+
+ + {{end_deletion_splitting}} + + {{start_deletion_nucleotide_coverage}} + +

Nucleotide Coverage

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Nucleotide Class Counts Sample Counts Genome% in Sample % in Genome fold coverage
A {{nuc_A_counts_obs}} {{nuc_A_counts_exp}} {{nuc_A_p_obs}} {{nuc_A_p_exp}} {{nuc_A_coverage}}
T {{nuc_T_counts_obs}} {{nuc_T_counts_exp}} {{nuc_T_p_obs}} {{nuc_T_p_exp}} {{nuc_T_coverage}}
C {{nuc_C_counts_obs}} {{nuc_C_counts_exp}} {{nuc_C_p_obs}} {{nuc_C_p_exp}} {{nuc_C_coverage}}
G {{nuc_G_counts_obs}} {{nuc_G_counts_exp}} {{nuc_G_p_obs}} {{nuc_G_p_exp}} {{nuc_G_coverage}}
AC {{nuc_AC_counts_obs}} {{nuc_AC_counts_exp}} {{nuc_AC_p_obs}} {{nuc_AC_p_exp}} {{nuc_AC_coverage}}
CA {{nuc_CA_counts_obs}} {{nuc_CA_counts_exp}} {{nuc_CA_p_obs}} {{nuc_CA_p_exp}} {{nuc_CA_coverage}}
TC {{nuc_TC_counts_obs}} {{nuc_TC_counts_exp}} {{nuc_TC_p_obs}} {{nuc_TC_p_exp}} {{nuc_TC_coverage}}
CT {{nuc_CT_counts_obs}} {{nuc_CT_counts_exp}} {{nuc_CT_p_obs}} {{nuc_CT_p_exp}} {{nuc_CT_coverage}}
CC {{nuc_CC_counts_obs}} {{nuc_CC_counts_exp}} {{nuc_CC_p_obs}} {{nuc_CC_p_exp}} {{nuc_CC_coverage}}
CG {{nuc_CG_counts_obs}} {{nuc_CG_counts_exp}} {{nuc_CG_p_obs}} {{nuc_CG_p_exp}} {{nuc_CG_coverage}}
GC {{nuc_GC_counts_obs}} {{nuc_GC_counts_exp}} {{nuc_GC_p_obs}} {{nuc_GC_p_exp}} {{nuc_GC_coverage}}
GG {{nuc_GG_counts_obs}} {{nuc_GG_counts_exp}} {{nuc_GG_p_obs}} {{nuc_GG_p_exp}} {{nuc_GG_coverage}}
AG {{nuc_AG_counts_obs}} {{nuc_AG_counts_exp}} {{nuc_AG_p_obs}} {{nuc_AG_p_exp}} {{nuc_AG_coverage}}
GA {{nuc_GA_counts_obs}} {{nuc_GA_counts_exp}} {{nuc_GA_p_obs}} {{nuc_GA_p_exp}} {{nuc_GA_coverage}}
TG {{nuc_TG_counts_obs}} {{nuc_TG_counts_exp}} {{nuc_TG_p_obs}} {{nuc_TG_p_exp}} {{nuc_TG_coverage}}
GT {{nuc_GT_counts_obs}} {{nuc_GT_counts_exp}} {{nuc_GT_p_obs}} {{nuc_GT_p_exp}} {{nuc_GT_coverage}}
TT {{nuc_TT_counts_obs}} {{nuc_TT_counts_exp}} {{nuc_TT_p_obs}} {{nuc_TT_p_exp}} {{nuc_TT_coverage}}
TA {{nuc_TA_counts_obs}} {{nuc_TA_counts_exp}} {{nuc_TA_p_obs}} {{nuc_TA_p_exp}} {{nuc_TA_coverage}}
AT {{nuc_AT_counts_obs}} {{nuc_AT_counts_exp}} {{nuc_AT_p_obs}} {{nuc_AT_p_exp}} {{nuc_AT_coverage}}
AA {{nuc_AA_counts_obs}} {{nuc_AA_counts_exp}} {{nuc_AA_p_obs}} {{nuc_AA_p_exp}} {{nuc_AA_coverage}}
+
+
+ {{end_deletion_nucleotide_coverage}} + + {{start_deletion_mbias}} + + +

M-Bias Plot

+
+ + {{start_deletion_mbias_2}} + +

 

+
+
+ + {{end_deletion_mbias_2}} + + {{end_deletion_mbias}} + +
+ Babraham Bioinformatics +

Analysis produced by Bismark (version {{bismark_version}}) - a tool to map bisulfite converted sequence reads and determine cytosine methylation states

+

Report graphs rendered using jQuery and Highcharts. Page design by Phil Ewels.

+
+ +
+ + + diff -r 0f8646f22b8d -r fcadce4d9a06 bismark_wrapper.py --- a/bismark_wrapper.py Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,373 +0,0 @@ -#!/usr/bin/env python - -import argparse -import os -import shutil -import subprocess -import sys -import shlex -import tempfile -import fileinput -import fileinput -from glob import glob - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - - #Parse Command Line - parser = argparse.ArgumentParser(description='Wrapper for the bismark bisulfite mapper.') - parser.add_argument( '-p', '--num-threads', dest='num_threads', - type=int, default=4, help='Use this many threads to align reads. The default is 4.' ) - - parser.add_argument( '--bismark_path', dest='bismark_path', help='Path to the bismark perl scripts' ) - - parser.add_argument( '--bowtie2', action='store_true', default=False, help='Running bismark with bowtie2 and not with bowtie.' ) - - # input options - parser.add_argument( '--own-file', dest='own_file', help='' ) - parser.add_argument( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) - parser.add_argument( '-O', '--output', dest='output' ) - - - parser.add_argument( '--output-report-file', dest='output_report_file' ) - parser.add_argument( '--suppress-header', dest='suppress_header', action="store_true" ) - - parser.add_argument( '--mate-paired', dest='mate_paired', action='store_true', help='Reads are mate-paired', default=False) - - - parser.add_argument( '-1', '--mate1', dest='mate1', - help='The forward reads file in Sanger FASTQ or FASTA format.' ) - parser.add_argument( '-2', '--mate2', dest='mate2', - help='The reverse reads file in Sanger FASTQ or FASTA format.' ) - parser.add_argument( '--sort-bam', dest='sort_bam', action="store_true" ) - - parser.add_argument( '--output-unmapped-reads', dest='output_unmapped_reads', - help='Additional output file with unmapped reads (single-end).' ) - parser.add_argument( '--output-unmapped-reads-l', dest='output_unmapped_reads_l', - help='File name for unmapped reads (left, paired-end).' ) - parser.add_argument( '--output-unmapped-reads-r', dest='output_unmapped_reads_r', - help='File name for unmapped reads (right, paired-end).' ) - - - parser.add_argument( '--output-suppressed-reads', dest='output_suppressed_reads', - help='Additional output file with suppressed reads (single-end).' ) - parser.add_argument( '--output-suppressed-reads-l', dest='output_suppressed_reads_l', - help='File name for suppressed reads (left, paired-end).' ) - parser.add_argument( '--output-suppressed-reads-r', dest='output_suppressed_reads_r', - help='File name for suppressed reads (right, paired-end).' ) - parser.add_argument( '--stdout', dest='output_stdout', - help='File name for the standard output of bismark.' ) - - - parser.add_argument( '--single-paired', dest='single_paired', - help='The single-end reads file in Sanger FASTQ or FASTA format.' ) - - parser.add_argument( '--fastq', action='store_true', help='Query filetype is in FASTQ format') - parser.add_argument( '--fasta', action='store_true', help='Query filetype is in FASTA format') - parser.add_argument( '--phred64-quals', dest='phred64', action="store_true" ) - - - parser.add_argument( '--skip-reads', dest='skip_reads', type=int ) - parser.add_argument( '--qupto', type=int) - - - # paired end options - parser.add_argument( '-I', '--minins', dest='min_insert' ) - parser.add_argument( '-X', '--maxins', dest='max_insert' ) - parser.add_argument( '--no-mixed', dest='no_mixed', action="store_true" ) - parser.add_argument( '--no-discordant', dest='no_discordant', action="store_true" ) - - #parse general options - # default 20 - parser.add_argument( '--seed-len', dest='seed_len', type=int) - # default 15 - parser.add_argument( '--seed-extention-attempts', dest='seed_extention_attempts', type=int ) - # default 0 - parser.add_argument( '--seed-mismatches', dest='seed_mismatches', type=int ) - # default 2 - parser.add_argument( '--max-reseed', dest='max_reseed', type=int ) - """ - # default 70 - parser.add_argument( '--maqerr', dest='maqerr', type=int ) - """ - - """ - The number of megabytes of memory a given thread is given to store path - descriptors in --best mode. Best-first search must keep track of many paths - at once to ensure it is always extending the path with the lowest cumulative - cost. Bowtie tries to minimize the memory impact of the descriptors, but - they can still grow very large in some cases. If you receive an error message - saying that chunk memory has been exhausted in --best mode, try adjusting - this parameter up to dedicate more memory to the descriptors. Default: 512. - """ - parser.add_argument( '--chunkmbs', type=int, default=512 ) - - args = parser.parse_args() - - # Create bismark index if necessary. - index_dir = "" - if args.own_file: - """ - Create a temporary index with the offered files from the user. - Utilizing the script: bismark_genome_preparation - bismark_genome_preparation --bowtie2 hg19/ - """ - tmp_index_dir = tempfile.mkdtemp() - index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( args.own_file )[1].split( '.' )[:-1] ) ) - try: - """ - Create a hard link pointing to args.own_file named 'index_path'.fa. - """ - os.symlink( args.own_file, index_path + '.fa' ) - except Exception, e: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Error in linking the reference database.\n' + str( e ) ) - # bismark_genome_preparation needs the complete path to the folder in which the database is stored - if args.bowtie2: - cmd_index = 'bismark_genome_preparation --bowtie2 %s ' % ( tmp_index_dir ) - else: - cmd_index = 'bismark_genome_preparation %s ' % ( tmp_index_dir ) - if args.bismark_path: - if os.path.exists(args.bismark_path): - # add the path to the bismark perl scripts, that is needed for galaxy - cmd_index = os.path.join(args.bismark_path, cmd_index) - else: - # assume the same directory as that script - cmd_index = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd_index) - try: - tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stdout=open(os.devnull, 'wb'), stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - except Exception, e: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Error indexing reference sequence\n' + str( e ) ) - index_dir = tmp_index_dir - else: - # bowtie path is the path to the index directory and the first path of the index file name - index_dir = os.path.dirname( args.index_path ) - - # Build bismark command - - """ - Bismark requires a large amount of temporary disc space. If that is not available, for example on a cluster you can hardcode the - TMP to some larger space. It's not recommended but it works. - """ - #tmp_bismark_dir = tempfile.mkdtemp( dir='/data/0/galaxy_db/tmp/' ) - tmp_bismark_dir = tempfile.mkdtemp() - output_dir = os.path.join( tmp_bismark_dir, 'results') - cmd = 'bismark %(args)s --bam --temp_dir %(tmp_bismark_dir)s --gzip -o %(output_dir)s --quiet %(genome_folder)s %(reads)s' - - if args.fasta: - # the query input files (specified as mate1,mate2 or singles) are FastA - cmd = '%s %s' % (cmd, '--fasta') - elif args.fastq: - cmd = '%s %s' % (cmd, '--fastq') - - if args.bismark_path: - # add the path to the bismark perl scripts, that is needed for galaxy - if os.path.exists(args.bismark_path): - cmd = os.path.join(args.bismark_path, cmd) - else: - # assume the same directory as that script - cmd = 'perl %s' % os.path.join(os.path.realpath(os.path.dirname(__file__)), cmd) - - arguments = { - 'genome_folder': index_dir, - 'args': '', - 'tmp_bismark_dir': tmp_bismark_dir, - 'output_dir': output_dir, - } - - additional_opts = '' - # Set up the reads - if args.mate_paired: - # paired-end reads library - reads = '-1 %s ' % ( args.mate1 ) - reads += ' -2 %s ' % ( args.mate2 ) - additional_opts += ' -I %s -X %s ' % (args.min_insert, args.max_insert) - else: - # single paired reads library - reads = ' %s ' % ( args.single_paired ) - - - if not args.bowtie2: - # use bowtie specific options - #additional_opts += ' --best ' # bug in bismark, --best is not available as option. Only --non-best, best-mode is activated by default - if args.seed_mismatches: - # --seedmms - additional_opts += ' -n %s ' % args.seed_mismatches - if args.seed_len: - # --seedlen - additional_opts += ' -l %s ' % args.seed_len - - # alignment options - if args.bowtie2: - additional_opts += ' -p %s --bowtie2 ' % (int(args.num_threads/2)) #divides by 2 here since bismark will spawn 2 (original top and original bottom) jobs with -p threads each - if args.seed_mismatches: - additional_opts += ' -N %s ' % args.seed_mismatches - if args.seed_len: - additional_opts += ' -L %s ' % args.seed_len - if args.seed_extention_attempts: - additional_opts += ' -D %s ' % args.seed_extention_attempts - if args.max_reseed: - additional_opts += ' -R %s ' % args.max_reseed - if args.no_discordant: - additional_opts += ' --no-discordant ' - if args.no_mixed: - additional_opts += ' --no-mixed ' - """ - if args.maqerr: - additional_opts += ' --maqerr %s ' % args.maqerr - """ - if args.skip_reads: - additional_opts += ' --skip %s ' % args.skip_reads - if args.qupto: - additional_opts += ' --qupto %s ' % args.qupto - if args.phred64: - additional_opts += ' --phred64-quals ' - if args.suppress_header: - additional_opts += ' --sam-no-hd ' - if args.output_unmapped_reads or ( args.output_unmapped_reads_l and args.output_unmapped_reads_r): - additional_opts += ' --un ' - if args.output_suppressed_reads or ( args.output_suppressed_reads_l and args.output_suppressed_reads_r): - additional_opts += ' --ambiguous ' - - arguments.update( {'args': additional_opts, 'reads': reads} ) - - # Final bismark command: - cmd = cmd % arguments - # Run - try: - tmp_out = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp_out, 'wb' ) - tmp_err = tempfile.NamedTemporaryFile().name - tmp_stderr = open( tmp_err, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) - returncode = proc.wait() - - if returncode != 0: - tmp_stdout.close() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - - raise Exception, stderr - tmp_stdout.close() - tmp_stderr.close() - - # TODO: look for errors in program output. - except Exception, e: - stop_err( 'Error in bismark:\n' + str( e ) ) - - # collect and copy output files - if args.output_report_file: - output_report_file = open(args.output_report_file, 'w+') - for line in fileinput.input(glob( os.path.join( output_dir, '*report.txt') )): - output_report_file.write(line) - output_report_file.close() - - - if args.output_suppressed_reads: - shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads.txt'))[0], args.output_suppressed_reads ) - if args.output_suppressed_reads_l: - shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_1.txt'))[0], args.output_suppressed_reads_l ) - if args.output_suppressed_reads_r: - shutil.move( glob(os.path.join( output_dir, '*ambiguous_reads_2.txt'))[0], args.output_suppressed_reads_r ) - - if args.output_unmapped_reads: - shutil.move( glob(os.path.join( output_dir, '*unmapped_reads.txt'))[0], args.output_unmapped_reads ) - if args.output_unmapped_reads_l: - shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_1.txt'))[0], args.output_unmapped_reads_l ) - if args.output_unmapped_reads_r: - shutil.move( glob(os.path.join( output_dir, '*unmapped_reads_2.txt'))[0], args.output_unmapped_reads_r ) - - try: - """ - merge all bam files - """ - #tmp_out = tempfile.NamedTemporaryFile( dir=output_dir ).name - tmp_stdout = open( tmp_out, 'wab' ) - #tmp_err = tempfile.NamedTemporaryFile( dir=output_dir ).name - tmp_stderr = open( tmp_err, 'wab' ) - - tmp_res = tempfile.NamedTemporaryFile( dir= output_dir).name - - bam_files = glob( os.path.join( output_dir, '*.bam') ) - if len( bam_files ) > 1: - cmd = 'samtools merge -@ %s -f %s %s ' % ( args.num_threads, tmp_res, ' '.join( bam_files ) ) - - proc = subprocess.Popen( args=shlex.split( cmd ), stdout=subprocess.PIPE ) - - returncode = proc.wait() - tmp_stdout.close() - tmp_stderr.close() - if returncode != 0: - raise Exception, open( tmp_stderr.name ).read() - else: - tmp_res = bam_files[0] - - bam_path = "%s" % tmp_res - - if os.path.exists( bam_path ): - if args.sort_bam: - cmd = 'samtools sort -@ %s %s sorted_bam' % (args.num_threads, bam_path) - proc = subprocess.Popen( args=shlex.split( cmd ) ) - returncode = proc.wait() - if returncode != 0: - raise Exception("Error during '%s'" % cmd) - shutil.move( 'sorted_bam.bam', args.output ) - else: - shutil.move( bam_path, args.output ) - else: - stop_err( 'BAM file no found:\n' + str( bam_path ) ) - - - # TODO: look for errors in program output. - except Exception, e: - stop_err( 'Error in merging bam files:\n' + str( e ) ) - - - if args.output_stdout: - # copy the temporary saved stdout from bismark - shutil.move( tmp_out, args.output_stdout ) - - # Clean up temp dirs - if args.own_file: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - if os.path.exists( tmp_bismark_dir ): - shutil.rmtree( tmp_bismark_dir ) - if os.path.exists( output_dir ): - shutil.rmtree( output_dir ) - -if __name__=="__main__": __main__() diff -r 0f8646f22b8d -r fcadce4d9a06 documentation/Bismark_User_Guide.pdf Binary file documentation/Bismark_User_Guide.pdf has changed diff -r 0f8646f22b8d -r fcadce4d9a06 documentation/readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/documentation/readme.rst Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,53 @@ +=============== +Bismark Wrapper +=============== + +Bismark_ uses Bowtie or Bowtie2 to map bisulfite converted sequence reads to a reference genome and determine cytosine methylation states. + +Publication: http://www.ncbi.nlm.nih.gov/pubmed/21493656 + +User Guide: http://www.bioinformatics.babraham.ac.uk/projects/bismark/Bismark_User_Guide_v0.7.12.pdf + +.. _bismark: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ + +Preparation +=========== + +Create your reference index with *bismark_genome_preparation* in your normal Galaxy Bowtie2/Botwie index directory. It will create a Bisulfite_Genome folder directly in your Bowtie2/Bowtie index directory. +If you follow that approach you do not need to specify or modify an extra .loc file. +That wrapper will extract the path to the Bisulfite_Genome folder from ./tool-data/bowtie2_indices.loc or ./tool-data/bowtie_indices.loc. + +======= +History +======= + +- v0.7: Initial public release +- v0.7.8: update and add Tool Shed Integration +- v0.7.11.1 change default output to BAM, from now on samtools are required +- v0.7.11.2 added multi-threading to samtools (samtools > 0.1.19 is required) +- v0.7.12 upgrade to bismark 0.7.12 and fix a major slowdown +- v0.7.12.1 define a dependency to samtools 0.1.19 + + +=============================== +Wrapper Licence (MIT/BSD style) +=============================== + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. + diff -r 0f8646f22b8d -r fcadce4d9a06 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,65 @@ + + + 0.14 + + + samtools + + + + + + + + + bismark --version + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1093/bioinformatics/btr167 + + + diff -r 0f8646f22b8d -r fcadce4d9a06 new/bismark --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new/bismark Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,9636 @@ +#!/usr/bin/perl -- +use strict; +use warnings; +use IO::Handle; +use Cwd; +$|++; +use Getopt::Long; + + +## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + + +my $parent_dir = getcwd; +my $bismark_version = 'v0.14.3'; +my $command_line = join (" ",@ARGV); + + +### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail +foreach my $arg (@ARGV){ + if ($arg eq '--solexa1.3-quals'){ + $arg = '--phred64-quals'; + } +} +my @filenames; # will be populated by processing the command line + +my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index,$multicore) = process_command_line(); + +my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment +my %chromosomes; # stores the chromosome sequences of the mouse genome +my %SQ_order; # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes +my %counting; # counting various events +my @pids; # storing the process IDs of child processes in parallel mode + + +my $seqID_contains_tabs; +my $verbose = 0; + +if ($multicore > 1){ + warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n"; +} + + +sub multi_process_handling{ + + my $offset = 1; + my $process_id; + if ($multicore > 1){ + + until ($offset == $multicore){ + # warn "multicore: $multicore\noffset: $offset\n"; + my $fork = fork; + + if (defined $fork){ + if ($fork != 0){ + $process_id = $fork; + push @pids, $process_id; + if ($offset < $multicore){ + ++$offset; + # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; + } + else{ + # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; + } + } + elsif ($fork == 0){ + # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; + $process_id = $fork; + last; + } + } + else{ + die "Forking unsuccessful. Proceeding using a single thread only\n"; + } + } + + # warn "\nThe Thread Identity\n===================\n"; + if ($process_id){ + # print "I am the parent process. My children are called:\n"; + # print join ("\t",@pids),"\n"; + # print "I am going to process the following line count: $offset\n\n"; + } + elsif($process_id == 0){ + # warn "I am a child process: Process ID: $process_id\n"; + # warn "I am going to process the following line count: $offset\n\n"; + } + else{ + die "Process ID was: '$process_id'\n"; + } + } + else{ + warn "Single-core mode: setting pid to 1\n"; + $process_id = 1; + } + + return ($process_id,$offset); +} + + +sub subset_input_file_FastQ{ + + my ($filename,$process_id,$offset) = @_; + + if ($filename =~ /gz$/){ + open (OFFSET,"zcat $filename |") or die "Couldn't read from file '$filename': $!\n"; + } + else{ + open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n"; + } + + # warn "offset is $offset\n"; + my $temp = $filename; + $temp .= ".temp.$offset"; + $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information + + if ($gzip){ + $temp .= '.gz'; + open (TEMPFQ,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n"; + } + else{ + open (TEMPFQ,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n"; + } + + my $line_count = 0; + + while (1){ + my $l1 = ; + my $l2 = ; + my $l3 = ; + my $l4 = ; + + last unless ($l4); + ++$line_count; + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + print TEMPFQ "$l1$l2$l3$l4"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + } + + close OFFSET or warn $!; + close TEMPFQ or warn "Failed to close file handle TEMPFQ: $!\n"; + + warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n"; + + return ($temp); # returning the subset filename + +} + +sub subset_input_file_FastA{ + + my ($filename,$process_id,$offset) = @_; + + if ($filename =~ /gz$/){ + open (OFFSET,"zcat $filename |") or die "Couldn't read from file '$filename': $!\n"; + } + else{ + open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n"; + } + + # warn "offset is $offset\n"; + my $temp = $filename; + $temp .= ".temp.$offset"; + + if ($gzip){ + $temp .= '.gz'; + open (TEMPFQ,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n"; + } + else{ + open (TEMPFQ,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n"; + } + + warn "Writing temporary infile to $temp\n"; + + my $line_count = 0; + + while (1){ + my $l1 = ; + my $l2 = ; + + last unless ($l2); + ++$line_count; + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + print TEMPFQ "$l1$l2"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + } + + close OFFSET or warn $!; + close TEMPFQ or warn "Failed to close file handle TEMPFQ: $!\n"; + + warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n"; + + return ($temp); # returning the subset filename + +} + +##### +##### + +foreach my $filename (@filenames){ + + my $original_filename = $filename; + my $original_filename_1; + my $original_filename_2; + + chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n"; + ### resetting the counting hash and fhs + reset_counters_and_fhs($filename); + @pids = (); + $seqID_contains_tabs = 0; + + ### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by + ### splitting the input file(s) into several smaller subfiles and merging the results back at + ### the end. + + # get general settings (also for single-threaded use) + my ($pid,$offset) = multi_process_handling (); + + my ($single_end,$paired_end); + ### PAIRED-END ALIGNMENTS + if ($filename =~ ','){ + + $single_end = 0; + $paired_end = 1; + + my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file + + $fhs[0]->{name} = 'CTread1GAread2CTgenome'; + $fhs[1]->{name} = 'GAread1CTread2GAgenome'; + $fhs[2]->{name} = 'GAread1CTread2CTgenome'; + $fhs[3]->{name} = 'CTread1GAread2GAgenome'; + warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; + + my ($filename_1,$filename_2) = (split (/,/,$filename)); + $original_filename_1 = $filename_1; + $original_filename_2 = $filename_2; + + warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; + + ### subsetting the input file(s) + unless ($multicore == 1){ # not needed in single-core mode + # warn "My PID: $pid\nMy offset: $offset\n"; + if ($sequence_file_format eq 'FASTA'){ + my $temp_filename_1 = subset_input_file_FastA($filename_1,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; + $filename_1 = "${temp_dir}$temp_filename_1"; + + my $temp_filename_2 = subset_input_file_FastA($filename_2,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; + $filename_2 = "${temp_dir}$temp_filename_2"; + } + else{ # FastQ format, default + my $temp_filename_1 = subset_input_file_FastQ($filename_1,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; + $filename_1 = "${temp_dir}$temp_filename_1"; + + my $temp_filename_2 = subset_input_file_FastQ($filename_2,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; + $filename_2 = "${temp_dir}$temp_filename_2"; + } + } + + ### additional variables only for paired-end alignments + my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Input files are in FastA format\n"; + + if ($directional){ + ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + + ### FastQ format + else{ + warn "Input files are in FastQ format\n"; + if ($directional){ + if ($bowtie2){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[0]->{inputfile_2} = undef; # no longer needed + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[3]->{inputfile_2} = undef; # no longer needed + } + else{ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + elsif($pbat){ # PBAT-Seq. This works for both Bowtie and Bowtie 2 + ### At the moment we are only performing alignments only with uncompressed FastQ files + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = undef; + $fhs[0]->{inputfile_2} = undef; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = undef; + $fhs[3]->{inputfile_2} = undef; + } + else{ + if ($bowtie2){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files + } + else{ # uncompressed temp files + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } + + ### Else we are performing SINGLE-END ALIGNMENTS + else{ + warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; + + $single_end = 1; + $paired_end = 0; + + ### subsetting the input file(s) + unless ($multicore == 1){ # not needed in single-core mode + # warn "My PID: $pid\nMy offset: $offset\n"; + if ($sequence_file_format eq 'FASTA'){ + my $temp_filename = subset_input_file_FastA($filename,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; + $filename = "${temp_dir}$temp_filename"; + } + else{ # FastQ format, default + my $temp_filename = subset_input_file_FastQ($filename,$pid,$offset); + warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; + $filename = "${temp_dir}$temp_filename"; + } + } + + ### Initialising bisulfite conversion filenames + my ($C_to_T_infile,$G_to_A_infile); + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Inut file is in FastA format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); + } + } + + ## FastQ format + else{ + warn "Input file is in FastQ format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + elsif($pbat){ + ($G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating up to 4 different bowtie filehandles and storing the first entry + if ($pbat){ + if ($bowtie2){ # as of version 0.10.2 we also support PBAT alignments for Bowtie 2 + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (undef,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); + } + } + elsif ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); + } + } + + start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid); + + } + + ### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS + + if ($pid){ # only performing this for the parent process + + if ($multicore > 1){ + + warn "Now waiting for all child processes to complete\n"; + + ### we need to ensure that we wait for all child processes to be finished before continuing + # warn "here are the child IDs: @pids\n"; + # warn "Looping through the child process IDs:\n"; + + foreach my $id (@pids){ + # print "$id\t"; + my $kid = waitpid ($id,0); + # print "Returned: $kid\nExit status: $?\n"; + unless ($? == 0){ + warn "\nChild process terminated with exit signal: '$?'\n\n"; + } + } + + # regenerating names for temporary files + my @temp_input; + my @temp_output; + my @temp_reports; + my @temp_unmapped_1; # will store single end reads or R1 of paired-end + my @temp_unmapped_2; + my @temp_ambiguous_1; # will store single end reads or R1 of paired-end + my @temp_ambiguous_2; + + for (1..$offset){ + + # Temp Input Files + if ($single_end){ + if ($gzip){ + push @temp_input, "${original_filename}.temp.${_}.gz"; + } + else{ + push @temp_input, "${original_filename}.temp.${_}"; + } + + } + elsif($paired_end){ + if ($gzip){ + push @temp_input, "${original_filename_1}.temp.${_}.gz"; + push @temp_input, "${original_filename_2}.temp.${_}.gz"; + } + else{ + push @temp_input, "${original_filename_1}.temp.${_}"; + push @temp_input, "${original_filename_2}.temp.${_}"; + } + } + + # if files had a prefix we need to specify it + my $add_prefix; + if (defined $prefix){ + $add_prefix = "${prefix}."; + } + else{ + $add_prefix = ''; + } + + # Temp Output Files + if ($single_end){ + + if ($bowtie2){ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2_SE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2_SE_report.txt"; + } + } + else{ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_SE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_SE_report.txt"; + } + } + + if ($unmapped){ + if ($gzip){ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_unmapped_reads.fq"; + } + else{ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_unmapped_reads.fq"; + } + } + + if ($ambiguous){ + if ($gzip){ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_ambiguous_reads.fq"; + } + else{ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_ambiguous_reads.fq"; + } + } + + } + elsif($paired_end){ + if ($bowtie2){ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_PE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_PE_report.txt"; + } + } + else{ + if ($gzip){ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_PE_report.txt"; + } + else{ + push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_pe.bam"; + push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_PE_report.txt"; + } + } + + if ($unmapped){ + if ($gzip){ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_unmapped_reads_1.fq"; + push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_unmapped_reads_2.fq"; + } + else{ + push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_unmapped_reads_1.fq"; + push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_unmapped_reads_2.fq"; + } + } + + if ($ambiguous){ + if ($gzip){ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_ambiguous_reads_1.fq"; + push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_ambiguous_reads_2.fq"; + } + else{ + push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_ambiguous_reads_1.fq"; + push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_ambiguous_reads_2.fq"; + } + } + + } + } + + warn "\n\nRight, cleaning up now...\n\n"; + + # deleting temp files; + warn "Deleting temporary sequence files...\n"; + foreach my $temp (@temp_input){ + #print "$temp\t"; + $temp =~ s/.*\///; # deleting path information + print "${temp_dir}${temp}\t"; + unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n"; + } + print "\n\n"; + + # merging temp BAM files + if ($single_end){ + merge_individual_BAM_files(\@temp_output,$original_filename,$single_end); + } + else{ + merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end); + } + + # deleting temp BAM files + warn "Deleting temporary BAM files...\n"; + foreach my $temp (@temp_output){ + # print "$temp\t"; + $temp =~ s/.*\///; # deleting path information + print "${output_dir}${temp}\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n"; + } + print "\n\n"; + + if ($unmapped){ + if ($single_end){ + merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end); + } + else{ + merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1'); + merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2'); + } + + # deleting temp unmapped files + warn "Deleting temporary unmapped files...\n"; + foreach my $temp (@temp_unmapped_1){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; + } + if ($paired_end){ + foreach my $temp (@temp_unmapped_2){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; + } + } + print "\n\n"; + + } + + if ($ambiguous){ + if ($single_end){ + merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end); + } + else{ + merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1'); + merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2'); + } + + # deleting temp ambiguous files + warn "Deleting temporary ambiguous files...\n"; + foreach my $temp (@temp_ambiguous_1){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; + } + + if ($paired_end){ + foreach my $temp (@temp_ambiguous_2){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; + } + } + print "\n\n"; + } + + # resetting the counters once more so we can add all data from all temporary reports + reset_counters_and_fhs($original_filename); + + ### Merging the Bismark mapping report files + if ($single_end){ + merge_individual_splitting_reports(\@temp_reports,$original_filename,$single_end); + print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis'); + } + else{ + merge_individual_splitting_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2); + print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis'); + } + + # deleting temp report files + warn "Deleting temporary report files...\n"; + foreach my $temp (@temp_reports){ + print "$temp\t"; + unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n"; + } + print "\n\n"; + + } + + } + + if ($pid){ # only for the Parent + warn "\n====================\nBismark run complete\n====================\n\n"; + } + +} + +sub merge_individual_splitting_reports{ + + my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_; + my $report_file = $original_filename_1; + $report_file =~ s/.*\///; # removing path information + if ($prefix){ + $report_file = "${prefix}.${report_file}"; + } + + if ($basename){ # Output file basename is set using the -B argument + $report_file = ${basename}; + } + + if ($single_end){ + if ($bowtie2){ + $report_file .= '_bismark_bt2_SE_report.txt'; + } + else{ + $report_file .= '_bismark_SE_report.txt'; + } + } + else{ + if ($bowtie2){ + $report_file .= '_bismark_bt2_PE_report.txt'; + } + else{ + $report_file .= '_bismark_PE_report.txt'; + } + } + warn "Writing report to ${output_dir}${report_file}\n"; + open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n"; + + foreach my $temp(@$temp_reports){ + $temp =~ s/.*\///; # removing path information + } + + warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n"; + + if ($single_end){ + print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n"; + } + else{ # paired-end + print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n"; + } + + + my $first = 0; + + foreach my $temp(@$temp_reports){ + # $temp =~ s/.*\///; # removing path information + + warn "Merging from file >> $temp <<\n"; + open (IN,"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}'\n"; + + ### this is printing the first couple of lines + while (){ + chomp; + if ($_ =~ /^Bismark report/){ + next; + } + + unless ($first){ # only happens for the first run we are processing + if ($_ =~ /^Final Alignment/){ + ++$first; + last; + } + else{ + print REPORT "$_\n"; + } + } + } + close IN or warn "Failed to close filehandle\n"; + + ### Simon says: You are going to regret this in the future. Just for the record. He might be right... + read_alignment_report($temp,$single_end); + + } + warn "\n"; + +} + +sub read_alignment_report{ + my ($report,$single_end) = @_; + + my $unique; + my $no_aln; + my $multiple; + my $no_genomic; + my $total_seqs; + my $bismark_version; + my $input_filename; + + my $unique_text; + my $no_aln_text; + my $multiple_text; + my $total_seq_text; + + my $total_C_count; + my ($meth_CpG,$meth_CHG,$meth_CHH,$meth_unknown); + my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH,$unmeth_unknown); + + my $number_OT; + my $number_CTOT; + my $number_CTOB; + my $number_OB; + + open (ALN,"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report'\n"; + + while (){ + chomp; + + ### General Alignment stats + if ($_ =~ /^Sequence pairs analysed in total:/ ){ ## Paired-end + (undef,$total_seqs) = split /\t/; + # warn "Total paired seqs: >> $total_seqs <<\n"; + } + elsif ($_ =~ /^Sequences analysed in total:/ ){ ## Single-end + (undef,$total_seqs) = split /\t/; + # warn "total single-end seqs >> $total_seqs <<\n"; + } + + elsif($_ =~ /^Number of paired-end alignments with a unique best hit:/){ ## Paired-end + (undef,$unique) = split /\t/; + # warn "Unique PE>> $unique <<\n"; + } + elsif($_ =~ /^Number of alignments with a unique best hit from/){ ## Single-end + (undef,$unique) = split /\t/; + # warn "Unique SE>> $unique <<\n"; + } + + elsif($_ =~ /^Sequence pairs with no alignments under any condition:/){ ## Paired-end + (undef,$no_aln) = split /\t/; + # warn "No alignment PE >> $no_aln <<\n"; + } + elsif($_ =~ /^Sequences with no alignments under any condition:/){ ## Single-end + (undef,$no_aln) = split /\t/; + # warn "No alignments SE>> $no_aln <<\n"; + } + + elsif($_ =~ /^Sequence pairs did not map uniquely:/){ ## Paired-end + (undef,$multiple) = split /\t/; + # warn "Multiple alignments PE >> $multiple <<\n"; + } + elsif($_ =~ /^Sequences did not map uniquely:/){ ## Single-end + (undef,$multiple) = split /\t/; + # warn "Multiple alignments SE >> $multiple <<\n"; + } + + elsif($_ =~ /^Sequence pairs which were discarded because genomic sequence could not be extracted:/){ ## Paired-end + (undef,$no_genomic) = split /\t/; + # warn "No genomic sequence PE >> $no_genomic <<\n"; + } + elsif($_ =~ /^Sequences which were discarded because genomic sequence could not be extracted:/){ ## Single-end + (undef,$no_genomic) = split /\t/; + # warn "No genomic sequence SE>> $no_genomic <<\n"; + } + + ### Context Methylation + elsif($_ =~ /^Total number of C/ ){ + (undef,$total_C_count) = split /\t/; + # warn "Total number C >> $total_C_count <<\n"; + } + + elsif($_ =~ /^Total methylated C\'s in CpG context:/ ){ + (undef,$meth_CpG) = split /\t/; + # warn "meth CpG >> $meth_CpG <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in CHG context:/ ){ + (undef,$meth_CHG) = split /\t/; + # warn "meth CHG >> $meth_CHG <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in CHH context:/ ){ + (undef,$meth_CHH) = split /\t/; + # warn "meth CHH >> $meth_CHH <<\n" ; + } + elsif($_ =~ /^Total methylated C\'s in Unknown context:/ ){ + (undef,$meth_unknown) = split /\t/; + # warn "meth Unknown >> $meth_unknown <<\n" ; + } + + elsif($_ =~ /^Total unmethylated C\'s in CpG context:/ or $_ =~ /^Total C to T conversions in CpG context:/){ + (undef,$unmeth_CpG) = split /\t/; + # warn "unmeth CpG >> $unmeth_CpG <<\n" ; + } + elsif($_ =~ /^Total unmethylated C\'s in CHG context:/ or $_ =~ /^Total C to T conversions in CHG context:/){ + (undef,$unmeth_CHG) = split /\t/; + # warn "unmeth CHG >> $unmeth_CHG <<\n" ; + } + elsif($_ =~ /^Total unmethylated C\'s in CHH context:/ or $_ =~ /^Total C to T conversions in CHH context:/){ + (undef,$unmeth_CHH) = split /\t/; + # warn "unmeth CHH >> $unmeth_CHH <<\n"; + } + elsif($_ =~ /^Total unmethylated C\'s in Unknown context:/ or $_ =~ /^Total C to T conversions in Unknown context:/){ + (undef,$unmeth_unknown) = split /\t/; + # warn "unmeth Unknown >> $unmeth_unknown <<\n" ; + } + + ### Strand Origin + + elsif($_ =~ /^CT\/GA\/CT:/ ){ ## Paired-end + (undef,$number_OT) = split /\t/; + # warn "Number OT PE>> $number_OT <<\n" ; + } + elsif($_ =~ /^CT\/CT:/ ){ ## Single-end + (undef,$number_OT) = split /\t/; + # warn "Number OT SE>> $number_OT <<\n" ; + } + + elsif($_ =~ /^GA\/CT\/CT:/ ){ ## Paired-end + (undef,$number_CTOT) = split /\t/; + # warn "Number CTOT PE >> $number_CTOT <<\n" ; + } + elsif($_ =~ /^GA\/CT:/ ){ ## Single-end + (undef,$number_CTOT) = split /\t/; + # warn "Number CTOT SE >> $number_CTOT <<\n" ; + } + + elsif($_ =~ /^GA\/CT\/GA:/ ){ ## Paired-end + (undef,$number_CTOB) = split /\t/; + # warn "Number CTOB PE >> $number_CTOB <<\n" ; + } + elsif($_ =~ /^GA\/GA:/ ){ ## Single-end + (undef,$number_CTOB) = split /\t/; + # warn "Number CTOB SE >> $number_CTOB <<\n"; + } + + elsif($_ =~ /^CT\/GA\/GA:/ ){ ## Paired-end + (undef,$number_OB) = split /\t/; + # warn "Number OB PE >> $number_OB <<\n"; + } + elsif($_ =~ /^CT\/GA:/ ){ ## Single-end + (undef,$number_OB) = split /\t/; + # warn "Number OB SE >> $number_OB <<\n"; + } + } + + $counting{sequences_count} += $total_seqs; + $counting{unique_best_alignment_count} += $unique; + $counting{no_single_alignment_found} += $no_aln; + $counting{unsuitable_sequence_count} += $multiple; + $counting{genomic_sequence_could_not_be_extracted_count} += $no_genomic; + + $counting{total_meCHH_count} += $meth_CHH; + $counting{total_meCHG_count} += $meth_CHG; + $counting{total_meCpG_count} += $meth_CpG; + if ($bowtie2){ + $counting{total_meC_unknown_count} += $meth_unknown; + } + + $counting{total_unmethylated_CHH_count} += $unmeth_CHH; + $counting{total_unmethylated_CHG_count} += $unmeth_CHG; + $counting{total_unmethylated_CpG_count} += $unmeth_CpG; + if ($bowtie2){ + $counting{total_unmethylated_C_unknown_count} += $unmeth_unknown; + } + + if ($single_end){ + $counting{CT_CT_count} += $number_OT; + $counting{CT_GA_count} += $number_OB; + $counting{GA_CT_count} += $number_CTOT; + $counting{GA_GA_count} += $number_CTOB; + } + else{ + # paired-end + $counting{GA_CT_CT_count} += $number_CTOT; + $counting{CT_GA_CT_count} += $number_OT; + $counting{GA_CT_GA_count} += $number_CTOB; + $counting{CT_GA_GA_count} += $number_OB; + } +} + +sub merge_individual_ambiguous_files{ + + my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_; + my $ambiguous_file = $original_filename; + $ambiguous_file =~ s/.*\///; # removing path information + + if ($prefix){ + $ambiguous_file = "${prefix}.${ambiguous_file}"; + } + + if ($single_end){ + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads.fq.gz"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads.fq.gz/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads.fa.gz/; + } + } + } + else{ # paired-end + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fq.gz"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fq.gz/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fa.gz/; + } + } + } + + foreach my $temp(@$temp_ambiguous){ + $temp =~ s/.*\///; # removing path information + } + + open (AMBIGUOUS,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n"; + + foreach my $temp(@$temp_ambiguous){ + warn "Merging from file >> $temp <<\n"; + if ($temp =~ /gz$/){ + open (IN,"zcat ${output_dir}$temp |") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n"; + } + else{ + open (IN,"${output_dir}$temp") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n"; + } + + while (){ + print AMBIGUOUS; + } + close IN or warn "Failed to close filehandle\n"; + } + warn "\n"; + + close AMBIGUOUS or warn "Failed to close output filehandle AMBIGUOUS\n\n"; +} + + +sub merge_individual_unmapped_files{ + + my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_; + my $unmapped_file = $original_filename; + $unmapped_file =~ s/.*\///; # removing path information + + if ($prefix){ + $unmapped_file = "${prefix}.${unmapped_file}"; + } + + if ($single_end){ + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads.fq.gz"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads.fq.gz/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads.fa.gz/; + } + } + } + else{ # paired-end + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads${paired_information}.fq.gz"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads${paired_information}.fa.gz"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads${paired_information}.fq.gz/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads${paired_information}.fa.gz/; + } + } + } + + foreach my $temp(@$temp_unmapped){ + $temp =~ s/.*\///; # removing path information + } + + open (UNMAPPED,"| gzip -c - > ${output_dir}${unmapped_file}") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n"; + warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n"; + + foreach my $temp(@$temp_unmapped){ + warn "Merging from file >> $temp <<\n"; + if ($temp =~ /gz$/){ + open (IN,"zcat ${output_dir}${temp} |") or die "Failed to read from unmapped temp file '${output_dir}$temp'\n"; + } + else{ + open (IN,"${output_dir}${temp}") or die "Failed to read from unmapped temp file '${output_dir}${temp}'\n"; + } + + while (){ + print UNMAPPED; + } + close IN or warn "Failed to close filehandle\n"; + } + warn "\n"; + + close UNMAPPED or warn "Failed to close output filehandle UNMAPPED\n\n"; +} + + +sub merge_individual_BAM_files{ + + my ($tempbam,$original_filename,$single_end) = @_; + my $merged_name = $original_filename; + + # warn "merged name is: $merged_name\n"; + $merged_name =~ s/.*\///; # deleting path information + # warn "merged name is: $merged_name\n"; sleep(1); + + foreach my $temp_bam(@$tempbam){ + $temp_bam =~ s/.*\///; # deleting path information + } + + if ($prefix){ + $merged_name = "$prefix.$merged_name"; + } + + if ($single_end){ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2.bam'; + } + else{ # BAM is the default output + $merged_name .= '_bismark.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}.bam"; + } + } + else{ + if ($bowtie2){ # BAM format is the default for Bowtie 2 + $merged_name .= '_bismark_bt2_pe.bam'; + } + else{ # BAM is the default output + $merged_name .= '_bismark_pe.bam'; + } + + if ($basename){ # Output file basename is set using the -B argument + $merged_name = "${basename}_pe.bam"; + } + } + + warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n"; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n"; + my $first = 0; + + foreach my $temp_bam(@$tempbam){ + # $temp_bam =~ s/.*\///; # deleting path information + + warn "Merging from file >> $temp_bam <<\n"; + + if ($first > 0){ + open (IN,"$samtools_path view ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + else{ # only for the first file we print the header as well + open (IN,"$samtools_path view -h ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n"; + } + + while (){ + print OUT; + } + close IN or warn "Failed to close filehandle\n"; + ++$first; + } + warn "\n"; + + close OUT or warn "Failed to close output filehandle\n\n"; +} + +sub start_methylation_call_procedure_single_ends { + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + my ($dir,$filename); + + if ($sequence_file =~ /\//){ + ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $sequence_file; + } + + ### printing all alignments to a results file + my $outfile = $filename; + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default for Bowtie 2 + $outfile =~ s/$/_bismark_bt2.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark.txt/; + } + else{ # SAM is the default output + $outfile =~ s/$/_bismark.sam/; + } + + if ($basename){ # Output file basename is set using the -B argument + $outfile = "${basename}.sam"; + } + + $bam = 0 unless (defined $bam); + + if ($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam$/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + + warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n"; + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_SE_report.txt/; + } + + if ($basename){ # Output file basename is set using the -B argument + $reportfile = "${basename}_SE_report.txt"; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; + + if ($unmapped){ + my $unmapped_file = $filename; + if ($prefix){ + $unmapped_file = "$prefix.$unmapped_file"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file = "${basename}_unmapped_reads.fq"; + } + else{ + $unmapped_file = "${basename}_unmapped_reads.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_file =~ s/$/_unmapped_reads.fq/; + } + else{ + $unmapped_file =~ s/$/_unmapped_reads.fa/; + } + } + + open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; + warn "Unmapped sequences will be written to $output_dir$unmapped_file\n"; + } + + if ($ambiguous){ + my $ambiguous_file = $filename; + + if ($prefix){ + $ambiguous_file = "$prefix.$ambiguous_file"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file = "${basename}_ambiguous_reads.fq"; + } + else{ + $ambiguous_file = "${basename}_ambiguous_reads.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $ambiguous_file =~ s/$/_ambiguous_reads.fq/; + } + else{ + $ambiguous_file =~ s/$/_ambiguous_reads.fa/; + } + } + open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n"; + } + elsif ($pbat){ + print REPORT "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n"; + } + else{ + print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n"; + } + + if ($bowtie2){ + print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + print "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input file is in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid); + } + ### Input file is in FastQ format + else{ + process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid); + } +} + +sub start_methylation_call_procedure_paired_ends { + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + + my ($dir_1,$filename_1); + + if ($sequence_file_1 =~ /\//){ + ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_1 = $sequence_file_1; + } + + my ($dir_2,$filename_2); + + if ($sequence_file_2 =~ /\//){ + ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_2 = $sequence_file_2; + } + + ### printing all alignments to a results file + my $outfile = $filename_1; + + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default Bowtie 2 output + $outfile =~ s/$/_bismark_bt2_pe.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark_pe.txt/; + } + else{ # SAM format is the default Bowtie 1 output + $outfile =~ s/$/_bismark_pe.sam/; + } + + if ($basename){ # Output file basename is set using the -B argument + $outfile = "${basename}_pe.sam"; + } + + + $bam = 0 unless (defined $bam); + + if ($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam$/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + + warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n"; + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename_1; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_PE_report.txt/; + } + + if ($basename){ # Output file basename is set using the -B argument + $reportfile = "${basename}_PE_report.txt"; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; + + if ($bowtie2){ + print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n"; + } + else{ + print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n"; + } + + + ### Unmapped read output + if ($unmapped){ + my $unmapped_1 = $filename_1; + my $unmapped_2 = $filename_2; + + if ($prefix){ + $unmapped_1 = "$prefix.$unmapped_1"; + $unmapped_2 = "$prefix.$unmapped_2"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_1 = "${basename}_unmapped_reads_1.fq"; + $unmapped_2 = "${basename}_unmapped_reads_2.fq"; + } + else{ + $unmapped_1 = "${basename}_unmapped_reads_1.fa"; + $unmapped_2 = "${basename}_unmapped_reads_2.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $unmapped_1 =~ s/$/_unmapped_reads_1.fq/; + $unmapped_2 =~ s/$/_unmapped_reads_2.fq/; + } + else{ + $unmapped_1 =~ s/$/_unmapped_reads_1.fa/; + $unmapped_2 =~ s/$/_unmapped_reads_2.fa/; + } + } + + open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; + open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; + print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; + } + + if ($ambiguous){ + my $amb_1 = $filename_1; + my $amb_2 = $filename_2; + + if ($prefix){ + $amb_1 = "$prefix.$amb_1"; + $amb_2 = "$prefix.$amb_2"; + } + + if ($basename){ # Output file basename is set using the -B argument + if ($sequence_file_format eq 'FASTQ'){ + $amb_1 = "${basename}_ambiguous_reads_1.fq"; + $amb_2 = "${basename}_ambiguous_reads_2.fq"; + } + else{ + $amb_1 = "${basename}_ambiguous_reads_1.fa"; + $amb_2 = "${basename}_ambiguous_reads_2.fa"; + } + } + else{ + if ($sequence_file_format eq 'FASTQ'){ + $amb_1 =~ s/$/_ambiguous_reads_1.fq/; + $amb_2 =~ s/$/_ambiguous_reads_2.fq/; + } + else{ + $amb_1 =~ s/$/_ambiguous_reads_1.fa/; + $amb_2 =~ s/$/_ambiguous_reads_2.fa/; + } + } + + open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; + open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; + print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n"; + } + elsif ($pbat){ + print REPORT "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n"; + } + else{ + print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n"; + } + + + + + ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + warn "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input files are in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } + ### Input files are in FastQ format + else{ + process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + } +} + +sub print_final_analysis_report_single_end{ + my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_; + + if ($merge_multi){ + warn "Printing a final merged alignment report for all individual sub-reports\n\n"; + } + else{ + ### All sequences from the original sequence file have been analysed now + ### deleting temporary C->T or G->A infiles + + if ($directional){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; + } + else{ + warn "Could not delete temporary file $C_to_T_infile properly $!\n"; + } + } + elsif ($pbat){ + my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary file $G_to_A_infile properly $!\n"; + } + } + else{ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary files properly $!\n"; + } + } + } + + ### printing a final report for the alignment procedure + print REPORT "Final Alignment report\n",'='x22,"\n"; + warn "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequences analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; + my $percent_alignable_sequences; + + if ($counting{sequences_count} == 0){ + $percent_alignable_sequences = 0; + } + else{ + $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + + warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; + print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; + + ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), + ### only calculating the percentage if there were any overruled alignments + if ($counting{low_complexity_alignments_overruled_count}){ + my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); + # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; + } + + print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + ### detailed information about Cs analysed + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage (CHG context) if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage (CHH context) if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + + if ($seqID_contains_tabs){ + warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + } +} + + +sub print_final_analysis_report_paired_ends{ + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_; + + if ($merge_multi){ + warn "Printing a final merged alignment report for all individual sub-reports\n\n"; + } + else{ + ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles + if ($directional){ + if ($G_to_A_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n"; + } + else{ + warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n"; + } + } + } + else{ + if ($G_to_A_infile_2 and $C_to_T_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 4){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + } + } + + ### printing a final report for the alignment procedure + warn "Final Alignment report\n",'='x22,"\n"; + print REPORT "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + + my $percent_alignable_sequence_pairs; + if ($counting{sequences_count} == 0){ + $percent_alignable_sequence_pairs = 0; + } + else{ + $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; + print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; + + print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + + + print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + ### detailed information about Cs analysed + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage in CHG context if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage in CHH context if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + +} + +sub process_single_end_fastA_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + # last if ($counting{sequences_count} > 100); + my $identifier = ; + my $sequence = ; + last unless ($identifier and $sequence); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + + $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG ">$identifier\n"; + print AMBIG "$sequence\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED ">$identifier\n"; + print UNMAPPED "$sequence\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid); + +} + +sub process_single_end_fastQ_file_for_methylation_call{ + + my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_; + + ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + my $identifier = ; + my $sequence = ; + my $identifier_2 = ; + my $quality_value = ; + last unless ($identifier and $sequence and $identifier_2 and $quality_value); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + chomp $quality_value; + + $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG "\@$identifier\n"; + print AMBIG "$sequence\n"; + print AMBIG $identifier_2; + print AMBIG "$quality_value\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED "\@$identifier\n"; + print UNMAPPED "$sequence\n"; + print UNMAPPED $identifier_2; + print UNMAPPED "$quality_value\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid); + +} + +sub process_fastA_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the + ### converted genomes (either the C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + + my $count = 0; + + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + + $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or die $!; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + +} + +sub process_fastQ_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_; + ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) + ### of the converted genomes (either C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + my $ident_1 = ; # not needed + my $quality_value_1 = ; # not needed + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + my $ident_2 = ; # not needed + my $quality_value_2 = ; # not needed + last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + chomp $quality_value_1; + chomp $quality_value_2; + + $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + # seq_1 + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_1 $ident_1; + print AMBIG_1 "$quality_value_1\n"; + # seq_2 + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + print AMBIG_2 $ident_2; + print AMBIG_2 "$quality_value_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + # seq_1 + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_1 $ident_1; + print UNMAPPED_1 "$quality_value_1\n"; + # seq_2 + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + print UNMAPPED_2 $ident_2; + print UNMAPPED_2 "$quality_value_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or die $!; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); + +} + +sub check_bowtie_results_single_end{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome + foreach my $index (0..$#fhs){ + + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + ############################################################### + ### STEP I Now processing the alignment stored in last_line ### + ############################################################### + my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation + ### we only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found_1 == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + + unless($mismatch_info){ + $mismatch_info = ''; + } + + chomp $mismatch_info; + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + my $alignment_location = join (":",$chromosome,$position); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + $number_of_mismatches = undef; + ################################################################################################################################################## + ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a + ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will + ### be returned as $valid_alignment_found and it will then be processed in the next round only. + ################################################################################################################################################## + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### we only continue to extract useful information about this second alignment if 1 was returned + if ($valid_alignment_found_2 == 1){ + ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + unless($mismatch_info){ + $mismatch_info = ''; + } + chomp $mismatch_info; + + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + ### extracting the chromosome number from the bowtie output (see above) + my $alignment_location = join (":",$chromosome,$position); + ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position + ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this + ### case we are not writing the same entry out a second time. + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + #################################################################################################################################### + #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### + #################################################################################################################################### + $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + next; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + ### still within the 2nd sequence in correct orientation found + } + ### still withing the 1st sequence in correct orientation found + } + ### still within the if (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + if ($unmapped){ + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + else{ + return; + } + } + ####################################################################################################################################################### + ####################################################################################################################################################### + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### + ####################################################################################################################################################### + ####################################################################################################################################################### + ### Going to use the variable $sequence_fails as a memory if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### sorting in ascending order + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + + ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + } + } + elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ + ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and + ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a + ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome + ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite + ### reaction. E.g. + ### CAGTCACGCGCGCGCG will become + ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. + ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave + ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: + ### G->A conversion: + ### highly methylated: CAATCACACACACACA + ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce + ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the + ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted + ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. + ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions + ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of + ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. + ### In the above example the number of transliterations required to transform the actual sequence + ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) + ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) + ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments + ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed + ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be + ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. + my @three_candidate_seqs; + foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ + my $transliterations_performed; + if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); + } + elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); + } + else{ + die "unexpected index number range $!\n"; + } + push @three_candidate_seqs,{ + index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, + bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, + mismatch_number => $mismatch_number, + chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, + position => $mismatches{$mismatch_number}->{$composite_location}->{position}, + seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, + transliterations_performed => $transliterations_performed, + }; + } + ### sorting in ascending order for the lowest number of transliterations performed + @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; + my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; + my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; + my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; + # print "$first_array_element\t$second_array_element\t$third_array_element\n"; + if (($first_array_element*2) < $second_array_element){ + $counting{low_complexity_alignments_overruled_count}++; + ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits + $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; + $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; + } + else{ + $sequence_fails = 1; + } + } + else{ + $sequence_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it will have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + + print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out +} + +sub check_bowtie_results_single_end_bowtie2{ + my ($sequence,$identifier,$quality_value) = @_; + + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. + # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs + # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; + + my $alignment_ambiguous = 0; + my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far + my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. + + my %alignments = (); + + ### reading from the Bowtie 2 output filehandles + foreach my $index (0..$#fhs){ + # print "Index: $index\n"; + # print "$fhs[$index]->{last_line}\n"; + # print "$fhs[$index]->{last_seq_id}\n"; + # sleep (1); + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; + + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; + + ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag == 4){ + ## reading in the next alignment, which must be the next sequence + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + } + + # if there are one or more proper alignments we can extract the chromosome number + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score,$second_best,$MD_tag); + my @fields = split (/\t/,$fhs[$index]->{last_line}); + + foreach (11..$#fields){ + if ($fields[$_] =~ /AS:i:(.*)/){ + $alignment_score = $1; + } + elsif ($fields[$_] =~ /XS:i:(.*)/){ + $second_best = $1; + } + elsif ($fields[$_] =~ /MD:Z:(.*)/){ + $MD_tag = $1; + } + } + + if (!defined $best_AS_so_far){ + $best_AS_so_far = $alignment_score; + # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; + } + else{ + if ($alignment_score > $best_AS_so_far){ # AS are generally negative with a maximum of 0 + $best_AS_so_far = $alignment_score; + # warn "Found better alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n"; + # resetting the ambiguous within thread memory (if applicable at all) + # warn "Resetting amb within thread value to 0\n"; + $amb_same_thread = 0; + } + else{ + # warn "current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; + } + } + + # warn "First best alignment_score is: '$alignment_score'\n"; + # warn "MD tag is: '$MD_tag'\n"; + die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag); + + if (defined $second_best){ + # warn "second best alignment_score is: '$second_best'\n\n"; + + # If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this + if ($alignment_score == $second_best){ + + # checking to see if this read produced the best alignment + if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous + # warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; + $amb_same_thread = 1; + } + else{ + # warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; + } + ### if there is a better alignment later on -> fine. If not, the read will get booted altogether + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{alignment_score_second_best} = $second_best; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{CIGAR} = $cigar; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + } + + ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{alignment_score_second_best} = undef; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + $alignments{$alignment_location}->{CIGAR} = $cigar; + } + + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + } + } + } + } + + ### If there were several equally good alignments for the best alignment score we will boot the read + if ($amb_same_thread){ + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + $alignment_ambiguous = 1; + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + } + else{ + # warn "alignment won't be considered ambiguous. This time....\n"; + } + + ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file + unless(%alignments){ + $counting{no_single_alignment_found}++; + # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$unmapped_read_output\n"; + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified + } + else{ + return 0; # default + } + } + + ####################################################################################################################################################### + + ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence altogether. + ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for + ### opening (5) and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + my $methylation_call_params; # hash reference which will store all information we need for the methylation call + my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + # if (scalar keys %alignments > 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; + # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "pos: $alignments{$alignment_location}->{position}\n"; + # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the hash with we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence altogether + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_alignment_score; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ + # print "$alignments{$alignment_location}->{alignment_score}\n"; + unless (defined $best_alignment_score){ + $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score: $best_alignment_score\n"; + } + else{ + ### if the second best alignment has the same alignment score as the first one, the sequence will get booted + if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ + # warn "Same alignment score, the sequence will get booted!\n"; + $sequence_fails = 1; + last; # exiting after the second alignment since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; + if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) { + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best}; + } + else { + $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score}; + } + last; # exiting after processing the second alignment since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print OUT "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it has a unique best alignment + $counting{unique_best_alignment_count}++; + + ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well + extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + # Compute MAPQ value + $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef, + $methylation_call_params->{$identifier}->{alignment_score}, + $methylation_call_params->{$identifier}->{alignment_score_second_best}); + + + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out +} + + +sub determine_number_of_transliterations_performed{ + my ($sequence,$read_conversion) = @_; + my $number_of_transliterations; + if ($read_conversion eq 'CT'){ + $number_of_transliterations = $sequence =~ tr/C/T/; + } + elsif ($read_conversion eq 'GA'){ + $number_of_transliterations = $sequence =~ tr/G/A/; + } + else{ + die "Read conversion mode of the read was not specified $!\n"; + } + return $number_of_transliterations; +} + +sub decide_whether_single_end_alignment_is_valid{ + my ($index,$identifier) = @_; + + # extracting from Bowtie 1 format + my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; + + ### ensuring that the entry is the correct sequence + if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation we need to read in a new line + elsif($orientation == 0){ + my $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + ($id,$strand) = (split (/\t/,$newline))[0,1]; + + ### ensuring that the next entry is still the correct sequence + if ($id eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 1; ### 2nd possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs + elsif ($orientation == 0){ + $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation + } + else{ + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round + else{ + return 0; + } +} +######################### +### BOWTIE 1 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + # sleep (1); + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome + + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary + ### strands are not being reported by specifying --directional + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; + + ################################################################################## + ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### + ################################################################################## + my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong + ### orientation. We only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. + ### we store the useful information in %mismatches + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + + ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches_1; + my $number_of_mismatches_2; + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "Position 1 is higher than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ################################################################################################################################################### + ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### + ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### + ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### + ### this round ### + ################################################################################################################################################### + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + else{ + die "Either read 1 or read 2 needs to end on '/1'\n"; + } + + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### Now processing the entry we just stored in last_line_1 and last_line_2 + $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to + ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation + if ($valid_alignment_found == 1){ + ### we store the useful information in %mismatches + ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; + ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + $number_of_mismatches_1=''; + $number_of_mismatches_2=''; + ### Now extracting the number of mismatches to the converted genome + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "position 1 is greater than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ############################################################################################################################################### + ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### + ############################################################################################################################################### + $newline_1 = $fhs[$index]->{fh}-> getline(); + $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### within the 2nd sequence pair alignment in correct orientation found + } + ### within the 1st sequence pair alignment in correct orientation found + } + ### still within the (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_pair_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; + foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ + #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; + # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; + } + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{seq_id} = $identifier; + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; + $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; + $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; + $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; + } + } + else{ + $sequence_pair_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); + + ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + + print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +######################### +### BOWTIE 2 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends_bowtie2{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + + # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + + + my %alignments; + my $alignment_ambiguous = 0; + my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far + my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. + + ### reading from the Bowtie 2 output filehandles + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary + ### strands are not being reported when '--directional' is specified + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + + my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; + my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; + # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; + # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; + # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; + # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; + $id_1 =~ s/\/1$//; + $id_2 =~ s/\/2$//; + + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). + ### We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag_1 == 77 and $flag_2 == 141){ + ## reading in the next alignment, which must be the next sequence + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + # print "current sequence ($identifier) did not map, reading in next sequence\n"; + # print "$index\t$fhs[$index]->{last_seq_id}\n"; + # print "$index\t$fhs[$index]->{last_line_1}\n"; + # print "$index\t$fhs[$index]->{last_line_2}\n"; + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; + } + } + + ### If there are one or more proper alignments we can extract the chromosome number + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + + ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); + + my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); + my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); + + foreach (11..$#fields_1){ + if ($fields_1[$_] =~ /AS:i:(.*)/){ + $alignment_score_1 = $1; + } + elsif ($fields_1[$_] =~ /XS:i:(.*)/){ + $second_best_1 = $1; + } + elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ + $MD_tag_1 = $1; + } + } + + foreach (11..$#fields_2){ + if ($fields_2[$_] =~ /AS:i:(.*)/){ + $alignment_score_2 = $1; + } + elsif ($fields_2[$_] =~ /XS:i:(.*)/){ + $second_best_2 = $1; + } + elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ + $MD_tag_2 = $1; + } + } + + die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); + die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); + + # warn "First read 1 alignment score is: '$alignment_score_1'\n"; + # warn "First read 2 alignment score is: '$alignment_score_2'\n"; + # warn "MD tag 1 is: '$MD_tag_1'\n"; + # warn "MD tag 2 is: '$MD_tag_2'\n"; + + ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments + my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; + # warn "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; + + if (!defined $best_AS_so_far){ + $best_AS_so_far = $sum_of_alignment_scores_1; + # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; + } + else{ + if ($sum_of_alignment_scores_1 > $best_AS_so_far){ # AS are generally negative with a maximum of 0 + $best_AS_so_far = $sum_of_alignment_scores_1; + # warn "Found better sum of alignment scores ($sum_of_alignment_scores), setting \$best_AS_so_far to $best_AS_so_far\n"; + # resetting the ambiguous within thread memory (if applicable at all) + # warn "Resetting amb within thread value to 0\n"; + $amb_same_thread = 0; + } + else{ + # warn "current alignment (AS $sum_of_alignment_scores) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; + } + } + + if (defined $second_best_1 and defined $second_best_2){ + my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; + # warn "Second best alignment_score_1 is: '$second_best_1'\n"; + # warn "Second best alignment_score_2 is: '$second_best_2'\n"; + # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; + + # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we we keep a memory of this + if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ + + # checking to see if this read pair produced the best alignment + if ($sum_of_alignment_scores_1 == $best_AS_so_far){ # yes this is the best read pair so far, either within the thread or between threads, however it is ambiguous + # warn "Read pair is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; + $amb_same_thread = 1; + } + else{ + # warn "This read pair has a worse alignment score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; + } + + ### if there is a better alignment later on -> fine. If not, the read will get booted altogether one way or another + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if ($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location; + if ($position_1 <= $position_2){ + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + } + elsif($position_2 < $position_1){ + $alignment_location = join(":",$chromosome_1,$position_2,$position_1); + } + + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = $sum_of_alignment_scores_second_best; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + # warn "added best of several alignments to \%alignments hash\n"; + + ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + # print "$alignment_location\n"; + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = undef; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + + # warn "added unique alignment to \%alignments hash\n"; + + # Now reading and storing the next read pair + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + # print "$newline_1\n"; + # print "$newline_2\n"; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + if ($seq_id_1 eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + } + } + } + } + + ### If there were several equally good alignments for the best alignment score we will boot the read + if ($amb_same_thread){ + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + $alignment_ambiguous = 1; + # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; + } + else{ + # warn "alignment won't be considered ambiguous. This time....\n"; + } + + + ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file + unless (%alignments){ + $counting{no_single_alignment_found}++; + + # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$unmapped_read_1\n"; + # print "$unmapped_read_2\n"; + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified + } + else{ + return 0; + } + } + + ####################################################################################################################################################### + + ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence pair altogether. + ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) + ### and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference + my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + ## if (scalar keys %alignments >= 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; + # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; + # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; + # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; + # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; + # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; + # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; + # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; + # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; + # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; + # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the %alignments hash we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores_second_best}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence pair altogether) + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_sum_of_alignment_scores; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ + + # warn "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; sleep(1); + + unless (defined $best_sum_of_alignment_scores){ + $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; + } + else{ + ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted + if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ + # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; + $sequence_pair_fails = 1; + last; # exiting since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; + + if (defined $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} and ( $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} > $alignments{$alignment_location}->{sum_of_alignment_scores} )) { + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best}; + } + else { + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$alignment_location}->{sum_of_alignment_scores}; + } + + last; # exiting since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # warn "$ambiguous_read_1\n"; + # warn "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence pair (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence pair has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); + + ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### Compute MAPQ value + $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence_1), length($sequence_2), + $methylation_call_params->{$identifier}->{sum_of_alignment_scores}, + $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best}); + + + ### now we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + # warn "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; + # warn " $sequence_2\n"; + # warn "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; + # warn " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; + + print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +### + +# Compute MAPQ value for a read or read pair as in Bowtie2-2.2.2 (specifically, V2 of the MAPQ calculator: "class BowtieMapq2") +# assuming end-to-end alignment with the default calculation of the minimum alignment score + +sub calc_mapq { + my ($read1Len, $read2Len, $AS_best, $AS_secBest) = @_; + + my $scMin = $score_min_intercept + $score_min_slope * $read1Len; + ### read2Len is only defined for paired-end reads, so for single-end mode we can just a score min value for read 1 + if (defined $read2Len){ + $scMin += $score_min_intercept + $score_min_slope * $read2Len; + } + + my $diff = abs$scMin; # scores can vary by up to this much (since max AS is 0 for end-to-end alignment) + my $bestOver = $AS_best - $scMin; + + if (!defined $AS_secBest) { + if ($bestOver >= $diff * 0.8) { return 42; } + elsif ($bestOver >= $diff * 0.7) { return 40; } + elsif ($bestOver >= $diff * 0.6) { return 24; } + elsif ($bestOver >= $diff * 0.5) { return 23; } + elsif ($bestOver >= $diff * 0.4) { return 8; } + elsif ($bestOver >= $diff * 0.3) { return 3; } + else { return 0; } + } else { + my $bestDiff = abs(abs($AS_best) - abs($AS_secBest)); + if ($bestDiff >= $diff * 0.9) { + if ($bestOver == $diff) { + return 39; + } else { + return 33; + } + } elsif ($bestDiff >= $diff * 0.8) { + if ($bestOver == $diff) { + return 38; + } else { + return 27; + } + } elsif ($bestDiff >= $diff * 0.7) { + if ($bestOver == $diff) { + return 37; + } else { + return 26; + } + } elsif ($bestDiff >= $diff * 0.6) { + if ($bestOver == $diff) { + return 36; + } else { + return 22; + } + } elsif ($bestDiff >= $diff * 0.5) { + if ($bestOver == $diff) { + return 35; + } elsif ($bestOver >= $diff * 0.84) { + return 25; + } elsif ($bestOver >= $diff * 0.68) { + return 16; + } else { + return 5; + } + } elsif ($bestDiff >= $diff * 0.4) { + if ($bestOver == $diff) { + return 34; + } elsif ($bestOver >= $diff * 0.84) { + return 21; + } elsif ($bestOver >= $diff * 0.68) { + return 14; + } else { + return 4; + } + } elsif ($bestDiff >= $diff * 0.3) { + if ($bestOver == $diff) { + return 32; + } elsif ($bestOver >= $diff * 0.88) { + return 18; + } elsif ($bestOver >= $diff * 0.67) { + return 15; + } else { + return 3; + } + } elsif ($bestDiff >= $diff * 0.2) { + if ($bestOver == $diff) { + return 31; + } elsif ($bestOver >= $diff * 0.88) { + return 17; + } elsif ($bestOver >= $diff * 0.67) { + return 11; + } else { + return 0; + } + } elsif ($bestDiff >= $diff * 0.1) { + if ($bestOver == $diff) { + return 30; + } elsif ($bestOver >= $diff * 0.88) { + return 12; + } elsif ($bestOver >= $diff * 0.67) { + return 7; + } else { + return 0; + } + } elsif ($bestDiff > 0) { + if ($bestOver >= $diff * 0.67) { + return 6; + } else { + return 2; + } + } else { + if ($bestOver >= $diff * 0.67) { + return 1; + } else { + return 0; + } + } + } +} + + +### + +sub decide_whether_paired_end_alignment_is_valid{ + my ($index,$identifier) = @_; + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + my $seq_id_1 = $id_1; + my $seq_id_2 = $id_2; + $seq_id_1 =~ s/\/1$//; # removing the read /1 + $seq_id_2 =~ s/\/1$//; # removing the read /1 + + ### ensuring that the current entry is the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation we need to read in two new lines + elsif($orientation == 0){ + my $newline_1 = $fhs[$index]->{fh}->getline(); + my $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) + ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; + ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; + + my $seqid; + $seq_id_1 = $id_1; + $seq_id_2 = $id_2; + # we need to capture the first read (ending on /1) + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### ensuring that the next entry is still the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + ### Writing the current sequence to last_line_1 and last_line_2 + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be + ### the next entry) + elsif ($orientation == 0){ + $newline_1 = $fhs[$index]->{fh}->getline(); + $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ($seq_id_1) = split (/\t/,$newline_1); + ($seq_id_2) = split (/\t/,$newline_2); + + $seqid = ''; + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + else { + ### assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and both last_lines (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round + else{ + return 0; + } +} + +### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the first or last position of the actually observed sequence + my $non_bisulfite_sequence_1; + my $non_bisulfite_sequence_2; + + ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic + ### sequences around! + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### for hits on the forward strand we need to capture 2 extra bases at the 3' end + + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 + + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) reverse strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end + if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) forward strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is CT converted so we need to capture 2 extra 3' bases + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation + if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is GA converted so we need to capture 2 extra 5' bases + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; +} + +### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; + my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; + my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; + my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; + + my $contains_deletion_1 = 0; + my $contains_deletion_2 = 0; + if ($cigar_1 =~ /D/){ + $contains_deletion_1 = 1; + if ($verbose){ warn "$cigar_1\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_1}\n";} + } + if ($cigar_2 =~ /D/){ + $contains_deletion_2 = 1; + if ($verbose){ warn "$cigar_2\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_2}\n";} + } + + # warn "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence_1 = ''; + my $non_bisulfite_sequence_2 = ''; + my $genomic_seq_for_MD_tag_1 = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output + my $genomic_seq_for_MD_tag_2 = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; + my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; + + # parsing CIGAR 1 string + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + # parsing CIGAR 2 string + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + + my $indels_1 = 0; # adding these to the hemming distance value (needed for the NM field in the final SAM output + my $indels_2 = 0; + + ### Extracting read 1 genomic sequence ### + + # extracting 2 additional bp at the 5' end (read 1) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; + return; + } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); + } + + foreach (0..$#len_1){ + if ($ops_1[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + } + # warn "$non_bisulfite_sequence_1\n"; + # adjusting position + $pos_1 += $len_1[$_]; + } + elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls, and we can later ignore it for the generation of the MD;Z: tag + $non_bisulfite_sequence_1 .= 'X' x $len_1[$_]; + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= 'X' x $len_1[$_]; + } + # warn "$non_bisulfite_sequence_1\n"; + # position doesn't need adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() + # indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + } + # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; + $pos_1 += $len_1[$_]; + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else{ + die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + ### 3' end of read 1 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); + } + + + ### Extracting read 2 genomic sequence ### + + ### 5' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); + } + + foreach (0..$#len_2){ + if ($ops_2[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + } + # warn "$non_bisulfite_sequence_2\n"; + # adjusting position + $pos_2 += $len_2[$_]; + } + elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can ignore this later during the generation of the MD:Z: tag + $non_bisulfite_sequence_2 .= 'X' x $len_2[$_]; + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= 'X' x $len_2[$_]; + } + # warn "$non_bisulfite_sequence_2\n"; + # position doesn't need adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() + # $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + } + # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; + $pos_2 += $len_2[$_]; + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else{ + die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + + ### 3' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise + # need to set read 1 as well now to prevent warning + # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; + # sleep(5); + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); + } + + ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly + + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) bottom strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + if ($contains_deletion_2){ + $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) top strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + if ($contains_deletion_1){ + $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; + + ## the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; + $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; + $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; + $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_result_single_end{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. + $methylation_call_params->{$identifier}->{position} += 1; + + ### writing every uniquely mapped read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); + print OUT "$bowtie1_output\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script + } +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_result_single_end_bowtie2{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) + $methylation_call_params->{$identifier}->{start_seq_1} += 1; + + ### writing every single aligned read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); + print OUT "$bowtie1_output_paired_end\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + } + +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends_bowtie2{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + +} + + +sub convert_phred64_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_solexa_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_phred_score_into_phred33_quality_string{ + my $qual = shift; + $qual = chr($qual+33); + return $qual; +} + +sub convert_phred64_quality_string_into_phred_score{ + my $string = shift; + my $qual = ord($string)-64; + return $qual; +} + +sub convert_solexa_pre1_3_quality_string_into_phred_score{ + ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 + my $string = shift; + my $qual = ord($string)-59; + return $qual; +} + + +sub extract_corresponding_genomic_sequence_single_end { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence; + ### depending on the conversion we want to make need to capture 1 extra base at the 3' end + + my $pbat_index_modifier = 0; + + if ($pbat){ + $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1! + } + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 + ### + 2 extra base at the 3' end + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > + ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > + ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### at this point we can also determine the end position of a read + $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); +} + + +sub extract_corresponding_genomic_sequence_single_end_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + + my $MD_tag = $methylation_call_params->{$sequence_identifier}->{MD_tag}; + my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; + + my $contains_deletion = 0; + if ($cigar =~ /D/){ + $contains_deletion = 1; + # warn "$cigar\n$MD_tag\n"; + } + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + my $non_bisulfite_sequence = ''; + my $genomic_seq_for_MD_tag = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my $pbat_index_modifier = 0; + + if ($pbat){ + $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1! + } + + ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) + if ( (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1) or (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); + } + + my $indels = 0; + + foreach (0..$#len){ + if ($ops[$_] eq 'M'){ + #extracting genomic sequence + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + } + # adjusting position + $pos += $len[$_]; + } + elsif ($ops[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can later ignore it better during the generation of the MD:Z-tag + $non_bisulfite_sequence .= 'X' x $len[$_]; + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= 'X' x $len[$_]; + } + # warn "$non_bisulfite_sequence\n"; + # position doesn't need to be adjusting + + ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail the base by base comparison in hemming_dist() + # $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif ($ops[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + + # we do however add the genomic sequence to the $genomic_sequence for MD-tag determination if the CIGAR string contained a deletion + if ($contains_deletion){ + $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + } + $pos += $len[$_]; + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else{ + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + + ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) + if ( ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0) or ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); + # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; + } + + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + if ($contains_deletion){ + $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + if ($contains_deletion){ + $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + } + else{ + die "Too many Bowtie 2 result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; + + # if ($contains_deletion){ + # warn "non-bis: $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence}\n"; + # warn "MD-seq: $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag}\n"; + # } + + ### the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; + $methylation_call_params->{$sequence_identifier}->{indels} = $indels; +} + +### METHYLATION CALL + +sub methylation_call{ + my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; + ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one + my @seq = split(//,$sequence_actually_observed); + my @genomic = split(//,$genomic_sequence); + # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; + ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either + ### CpG, CHH or CHG context + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in unknown context (was protected) ### + ### u for not methylated C in unknwon context (was converted) ### + ################################################################# + + my @match =(); + warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 + my $methyl_CHH_count = 0; + my $methyl_CHG_count = 0; + my $methyl_CpG_count = 0; + my $methyl_C_unknown_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CpG_count = 0; + my $unmethylated_C_unknown_count = 0; + + if ($read_conversion eq 'CT'){ + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index]) { + ### The residue can only be a C if it was not converted to T, i.e. protected my methylation + if ($genomic[$index] eq 'C') { + ### If the residue is a C we want to know if it was in CpG context or in any other context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C, methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C, methylated, in Unknown context + } + else { + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C, methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C, methylated, in CHH context + } + } + } + else { + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts + ### in the actually observed sequence + if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { + ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C, not methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C, not methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + elsif ($read_conversion eq 'GA'){ + # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; + + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index+2]) { + ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation + if ($genomic[$index+2] eq 'G') { + ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need + ### to look if the base upstream is a C + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C on opposing strand, methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C on opposing strand, methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C on opposing strand, methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C on opposing strand, methylated, in CHH context + } + } + } + else{ + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index+2]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts + ### on the opposing strand, so G to A conversions in the actually observed sequence + if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { + ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if + ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C on opposing strand, not methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C on opposing strand, not methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C on opposing strand, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + else{ + die "Strand conversion info is required to perform a methylation call\n"; + } + + my $methylation_call = join ("",@match); + + $counting{total_meCHH_count} += $methyl_CHH_count; + $counting{total_meCHG_count} += $methyl_CHG_count; + $counting{total_meCpG_count} += $methyl_CpG_count; + $counting{total_meC_unknown_count} += $methyl_C_unknown_count; + $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; + $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; + $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; + $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; + + # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; + + return $methylation_call; +} + +sub read_genome_into_memory{ + ## working directoy + my $cwd = shift; + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + my $SQ_count = 0; + + foreach my $chromosome_filename (@chromosome_filenames){ + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + $first_line =~ s/\r//; + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + my $sequence; + + while (){ + chomp; + $_ =~ s/\r//; # removing carriage returns if present + if ($_ =~ /^>/){ + + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $SQ_order{$SQ_count} = $chromosome_name; + + ++$SQ_count; + + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files + + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + + ++$SQ_count; + + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + $SQ_order{$SQ_count} = $chromosome_name; + } + } + print "\n"; + chdir $cwd or die "Failed to move to directory $cwd\n"; +} + +sub extract_chromosome_name { + ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + +sub reverse_complement{ + my $sequence = shift; + $sequence =~ tr/CATG/GTAC/; + $sequence = reverse($sequence); + return $sequence; +} + +sub biTransformFastAFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fa.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fa.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + + unless ($directional){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ### small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); + + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT "$header$sequence_C_to_T"; + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA "$header$sequence_G_to_A"; + } + } + close CTOT or die "Failed to close filehandle $!\n"; + + if ($directional){ + warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastAFiles_paired_end { + my ($file,$read_number) = @_; + + if ($gzip){ + warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n"; + sleep (2); + } + + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + } + else{ # all four strand output + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/); + + if ($read_number == 1){ + if ($bowtie2){ + $header =~ s/$/\/1\/1/; + } + else{ + $header =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $header =~ s/$/\/2\/2/; + } + else{ + $header =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + + if ($read_number == 1){ + print CTOT "$header$sequence_C_to_T"; + } + elsif ($read_number == 2){ + print GTOA "$header$sequence_G_to_A"; + } + } + else{ + print CTOT "$header$sequence_C_to_T"; + print GTOA "$header$sequence_G_to_A"; + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + return ($C_to_T_infile); + } + else{ + return ($G_to_A_infile); + } + } + else{ + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +sub biTransformFastQFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($pbat){ # PBAT-Seq + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ # directional or non-directional + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option + } + + unless ($directional){ + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($identifier,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + if ($pbat){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + else{ # directional or non-directional + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + } + + if ($directional){ + close CTOT or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + elsif($pbat){ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastQFiles_paired_end { + my ($file,$read_number) = @_; + my ($dir,$filename); + + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ + die "Read number needs to be 1 or 2, but was $read_number!\n\n"; + } + } + else{ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + ++$count; + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence= uc$sequence; # make input file case insensitive + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + if ($read_number == 1){ + if ($bowtie2){ + $identifier =~ s/$/\/1\/1/; + } + else{ + $identifier =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $identifier =~ s/$/\/2\/2/; + } + else{ + $identifier =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2\n"; + } + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + if ($read_number == 1){ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + } + else{ + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + else{ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + if ($directional){ + if ($read_number == 1){ + close CTOT or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile); + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES + +sub biTransformFastQFiles_paired_end_bowtie1_gzip { + my ($file_1,$file_2) = @_; + my ($dir,$filename); + + if ($file_1 =~ /\//){ + ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file_1; + } + + ### gzipped version of infile 1 + if ($file_1 =~ /\.gz$/){ + open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n"; + } + else{ + open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; + } + ### gzipped version of infile 2 + if ($file_2 =~ /\.gz$/){ + open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n"; + } + else{ + open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; + } + + + if ($skip){ + warn "Skipping the first $skip reads from $file_1 and $file_2\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; + sleep (1); + } + + my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; + $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; + $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + } + + $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; + $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + + warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; + open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; + # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; + + unless ($directional){ + print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; + open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; + } + + ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format: + ### + + my $count = 0; + while (1){ + my $identifier_1 = ; + my $sequence_1 = ; + my $identifier2_1 = ; + my $quality_score_1 = ; + + my $identifier_2 = ; + my $sequence_2 = ; + my $identifier2_2 = ; + my $quality_score_2 = ; + + last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); + + ++$count; + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ + die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ + die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + chomp $identifier_1; + chomp $sequence_1; + chomp $sequence_2; + chomp $quality_score_1; + chomp $quality_score_2; + + $identifier_1 =~ s/^\@//; + $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence_1 = uc$sequence_1; # make input file 1 case insensitive + $sequence_2 = uc$sequence_2; # make input file 2 case insensitive + + # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; + my $sequence_1_C_to_T = $sequence_1; + my $sequence_2_G_to_A = $sequence_2; + $sequence_1_C_to_T =~ tr/C/T/; + $sequence_2_G_to_A =~ tr/G/A/; + + print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; + + unless ($directional){ + my $sequence_1_G_to_A = $sequence_1; + my $sequence_2_C_to_T = $sequence_2; + $sequence_1_G_to_A =~ tr/G/A/; + $sequence_2_C_to_T =~ tr/C/T/; + print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; + } + } + + close CTPLUSGA or die "Couldn't close filehandle\n"; + warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; + + if ($directional){ + warn "\n"; + return ($CT_plus_GA_infile); + } + else{ + close GAPLUSCT or die "Couldn't close filehandle\n"; + warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; + return ($CT_plus_GA_infile,$GA_plus_CT_infile); + } +} + + +sub fix_IDs{ + my $id = shift; + $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores + return $id; +} + +sub ensure_sensical_alignment_orientation_single_end{ + my $index = shift; # index number if the sequence produced an alignment + my $strand = shift; + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) + ## here we only want reads in the forward (+) orientation + if ($fhs[$index]->{name} eq 'CTreadCTgenome') { + ### if the alignment is (+) we count it, and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) + ## here we only want reads in the forward (+) orientation + elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { + ### if the alignment is (+) we count it and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } else{ + die "One of the above conditions must be true\n"; + } +} + +sub ensure_sensical_alignment_orientation_paired_ends{ + my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## [Index 0, sequence originated from (converted) forward strand] + ## CT converted read 1 + ## GA converted read 2 + ## CT converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 1, sequence originated from (converted) reverse strand] + ## GA converted read 1 + ## CT converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 2, sequence originated from complementary to (converted) forward strand] + ## GA converted read 1 + ## CT converted read 2 + ## CT converted genome + ## here we only want read 1 in (-) orientation and read 2 in (+) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 3, sequence originated from complementary to (converted) reverse strand] + ## CT converted read 1 + ## GA converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + else{ + die "One of the above conditions must be true\n"; + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 1 (default) | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input file is $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ; PBAT-Seq)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 and $G_to_A_infile_1 and $C_to_T_infile_2 (non-directional; FastQ)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional or $pbat){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; # skipping unwanted filehandles + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + if ($gzip){ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n"; + open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n"; + sleep(5); + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + } + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + elsif ($pbat){ + warn "Input files are $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; + } + + ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional or $pbat){ # skipping unwanted filehandles + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + if ($gzip){ + open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie 2: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + + +### Bowtie 1 (default) | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; + } + + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + sleep (5); + + if ($gzip){ + open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n\n"; + } + elsif ($pbat){ + warn "Input file is $G_to_A_infile (FastQ)\n\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; + } + + ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; + warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; + + open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + # warn "$_\n"; + # sleep(1); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else { + last; + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + # warn "storing $id and\n$_\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +########################################################################################################################################### + +sub reset_counters_and_fhs{ + my $filename = shift; + %counting=( + total_meCHH_count => 0, + total_meCHG_count => 0, + total_meCpG_count => 0, + total_meC_unknown_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CpG_count => 0, + total_unmethylated_C_unknown_count => 0, + sequences_count => 0, + no_single_alignment_found => 0, + unsuitable_sequence_count => 0, + genomic_sequence_could_not_be_extracted_count => 0, + unique_best_alignment_count => 0, + low_complexity_alignments_overruled_count => 0, + CT_CT_count => 0, #(CT read/CT genome, original top strand) + CT_GA_count => 0, #(CT read/GA genome, original bottom strand) + GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) + GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) + CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) + GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) + GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) + CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) + alignments_rejected_count => 0, # only relevant if --directional was specified + ); + + if ($directional){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + elsif($pbat){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + else{ + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } +} + + +sub process_command_line{ + my @bowtie_options; + my $help; + my $mates1; + my $mates2; + my $path_to_bowtie; + my $fastq; + my $fasta; + my $skip; + my $qupto; + my $phred64; + my $phred33; + my $solexa; + my $mismatches; + my $seed_length; + my $best; + my $sequence_format; + my $version; + my $quiet; + my $chunk; + my $non_directional; + my $ceiling; + my $maxins; + my $minins; + my $unmapped; + my $multi_map; + my $output_dir; + my $bowtie2; + my $vanilla; + my $sam_no_hd; + my $seed_extension_fails; + my $reseed_repetitive_seeds; + my $most_valid_alignments; + my $score_min; + my $parallel; + my $temp_dir; + my $rdg; + my $rfg; + my $non_bs_mm; + my $samtools_path; + my $bam; + my $gzip; + my $pbat; + my $prefix; + my $old_flag; + my $basename; + my $sam; + my $multicore; + + my $command_line = GetOptions ('help|man' => \$help, + '1=s' => \$mates1, + '2=s' => \$mates2, + 'path_to_bowtie=s' => \$path_to_bowtie, + 'f|fasta' => \$fasta, + 'q|fastq' => \$fastq, + 's|skip=i' => \$skip, + 'u|upto=i' => \$qupto, + 'phred33-quals' => \$phred33, + 'phred64-quals|solexa1' => \$phred64, + 'solexa-quals' => \$solexa, + 'n|seedmms=i' => \$mismatches, + 'l|seedlen=i' => \$seed_length, + 'no_best' => \$best, + 'version' => \$version, + 'quiet' => \$quiet, + 'chunkmbs=i' => \$chunk, + 'non_directional' => \$non_directional, + 'I|minins=i' => \$minins, + 'X|maxins=i' => \$maxins, + 'e|maqerr=i' => \$ceiling, + 'un|unmapped' => \$unmapped, + 'ambiguous' => \$multi_map, + 'o|output_dir=s' => \$output_dir, + 'bowtie2' => \$bowtie2, + 'vanilla' => \$vanilla, + 'sam-no-hd' => \$sam_no_hd, + 'D=i' => \$seed_extension_fails, + 'R=i' => \$reseed_repetitive_seeds, + 'score_min=s' => \$score_min, + 'most_valid_alignments=i' => \$most_valid_alignments, + 'p=i' => \$parallel, + 'temp_dir=s' => \$temp_dir, + 'rdg=s' => \$rdg, + 'rfg=s' => \$rfg, + 'non_bs_mm' => \$non_bs_mm, + 'samtools_path=s' => \$samtools_path, + 'bam' => \$bam, + 'gzip' => \$gzip, + 'pbat' => \$pbat, + 'prefix=s' => \$prefix, + 'old_flag' => \$old_flag, + 'B|basename=s' => \$basename, + 'sam' => \$sam, + 'multicore=i' => \$multicore, + ); + + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + if ($version){ + print << "VERSION"; + + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Version: $bismark_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + + +VERSION + exit; + } + + + ########################## + ### PROCESSING OPTIONS ### + ########################## + + unless ($bowtie2){ + $bowtie2 = 0; + } + unless ($sam_no_hd){ + $sam_no_hd =0; + } + + ### PATH TO BOWTIE + ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (-d $path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie = "${path_to_bowtie}bowtie2"; + } + else{ + $path_to_bowtie = "${path_to_bowtie}bowtie"; + } + } + else{ + die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; + } + } + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2'; + warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } + else{ + $path_to_bowtie = 'bowtie'; + warn "Path to Bowtie specified as: $path_to_bowtie\n"; + } + } + + + if ($sam){ + warn "Output format manually set as SAM\n"; + } + else{ + $bam = 1; + warn "Output format is BAM (default)\n"; + } + + ### OUTPUT REQUESTED AS BAM FILE (default) + if ($bam){ + if ($vanilla){ + die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + + warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; + $bam = 1; + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; + $bam = 1; + } + } + + unless (defined $samtools_path){ + $bam = 2; + warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; + } + sleep (1); + } + + + #################################### + ### PROCESSING ARGUMENTS + + ### GENOME FOLDER + my $genome_folder = shift @ARGV; # mandatory + unless ($genome_folder){ + warn "Genome folder was not specified!\n"; + print_helpfile(); + exit; + } + + ### checking that the genome folder, all subfolders and the required bowtie index files exist + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + + if (chdir $genome_folder){ + my $absolute_genome_folder = getcwd; ## making the genome folder path absolute + unless ($absolute_genome_folder =~/\/$/){ + $absolute_genome_folder =~ s/$/\//; + } + warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; + $genome_folder = $absolute_genome_folder; + } + else{ + die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] {-1 -2 | } [] (--help for more details)\n"; + } + + my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; + my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; + + my $bt2_small_index_present = 1; + my $bt2_large_index_present = 1; + + if ($bowtie2){ ### Bowtie 2 + + ### Checking for small indixes first (ending in .bt2) + + # checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + + my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + warn "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; + $bt2_small_index_present = 0; + } + } + # checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); + + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + warn "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; + $bt2_small_index_present = 0; + } + } + + ### Using the small index preferentially + if ($bt2_small_index_present){ + $bt2_large_index_present = 0; + } + else{ # only checking for large indexes if the 'normal' one can't be found + warn "\nCouldn't find a traditional small Bowtie 2 index for the genome specified (ending in .bt2). Now searching for a large index instead (64-bit index ending in .bt2l)...\n"; + + ### If no small small indexes were found we look for large indexes (64-bit indexes, ending in .bt2l) + + # checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + + @CT_bowtie_index = ('BS_CT.1.bt2l','BS_CT.2.bt2l','BS_CT.3.bt2l','BS_CT.4.bt2l','BS_CT.rev.1.bt2l','BS_CT.rev.2.bt2l'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; + $bt2_large_index_present = 0; } + } + + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + @GA_bowtie_index = ('BS_GA.1.bt2l','BS_GA.2.bt2l','BS_GA.3.bt2l','BS_GA.4.bt2l','BS_GA.rev.1.bt2l','BS_GA.rev.2.bt2l'); + + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; + $bt2_large_index_present = 0; + } + } + + if ($bt2_large_index_present){ + warn "64-bit large genome Bowtie 2 index found...\n"; + } + else{ + die "Failed to detect either a standard (.bt2) or 64-bit (.bt2l) Bowtie 2 index for the genome specified. Please run the bismark_genome_preparation before launching Bismark\n\n"; + } + } + + } + + else{ ### Bowtie 1 (default) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + } + + my $CT_index_basename = "${CT_dir}BS_CT"; + my $GA_index_basename = "${GA_dir}BS_GA"; + + ### INPUT OPTIONS + + ### SEQUENCE FILE FORMAT + ### exits if both fastA and FastQ were specified + if ($fasta and $fastq){ + die "Only one sequence filetype can be specified (fastA or fastQ)\n"; + } + + ### unless fastA is specified explicitely, fastQ sequence format is expected by default + if ($fasta){ + print "FastA format specified\n"; + $sequence_format = 'FASTA'; + push @bowtie_options, '-f'; + } + elsif ($fastq){ + print "FastQ format specified\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + else{ + $fastq = 1; + print "FastQ format assumed (by default)\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + + ### SKIP + if ($skip){ + warn "Skipping the first $skip reads from the input file\n"; + # push @bowtie_options,"-s $skip"; + } + + ### UPTO + if ($qupto){ + warn "Processing sequences up to read no. $qupto from the input file\n"; + if ($bowtie2){ + # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 + } + else{ + # push @bowtie_options,"--qupto $qupto"; + } + } + + ### QUALITY VALUES + if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ + die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; + } + if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values works only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred33"; + } + else{ + push @bowtie_options,"--phred33-quals"; + } + } + if ($phred64){ + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values work only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred64"; + } + else{ + push @bowtie_options,"--phred64-quals"; + } + } + else{ + $phred64 = 0; + } + + if ($solexa){ + if ($bowtie2){ + die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; + } + # Solexa to Phred value conversion works only when -q is specified + unless ($fastq){ + die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; + } + push @bowtie_options,"--solexa-quals"; + } + else{ + $solexa = 0; + } + + ### ALIGNMENT OPTIONS + + ### MISMATCHES + if (defined $mismatches){ + if ($bowtie2){ + if ($mismatches == 0 or $mismatches == 1){ + push @bowtie_options,"-N $mismatches"; + } + else{ + die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n"; + } + } + else{ + if ($mismatches >= 0 and $mismatches <= 3){ + push @bowtie_options,"-n $mismatches"; + } + else{ + die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n"; + } + } + } + else{ + unless ($bowtie2){ + push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 + } + } + + ### SEED LENGTH + if (defined $seed_length){ + if ($bowtie2){ + push @bowtie_options,"-L $seed_length"; + } + else{ + push @bowtie_options,"-l $seed_length"; + } + } + + ### MISMATCH CEILING + if (defined $ceiling){ + die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); + push @bowtie_options,"-e $ceiling"; + } + + + ### BOWTIE 2 EFFORT OPTIONS + + ### CONSECUTIVE SEED EXTENSION FAILS + if (defined $seed_extension_fails){ + die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-D $seed_extension_fails"; + } + + ### RE-SEEDING REPETITIVE SEEDS + if (defined $reseed_repetitive_seeds){ + die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-R $reseed_repetitive_seeds"; + } + + + ### BOWTIE 2 SCORING OPTIONS + + my ($score_min_intercept, $score_min_slope); + + if ($score_min){ + die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + + unless ($score_min =~ /^L,(.+),(.+)$/){ + die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + ($score_min_intercept, $score_min_slope) = ($1, $2); + push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 + } + else{ + if ($bowtie2){ + ($score_min_intercept, $score_min_slope) = (0, -0.2); + push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 + } + } + + ### BOWTIE 2 READ GAP OPTIONS + my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); + + if ($rdg){ + die "The option '--rdg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rdg =~ /^(\d+),(\d+)$/){ + $deletion_open = $1; + $deletion_extend = $2; + } + else{ + die "The option '--rdg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rdg $rdg"; + } + else{ + $deletion_open = 5; + $deletion_extend = 3; + } + + ### BOWTIE 2 REFERENCE GAP OPTIONS + if ($rfg){ + die "The option '--rfg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rfg =~ /^(\d+),(\d+)$/){ + $insertion_open = $1; + $insertion_extend = $2; + } + else{ + die "The option '--rfg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rfg $rfg"; + } + else{ + $insertion_open = 5; + $insertion_extend = 3; + } + + + ### BOWTIE 2 PARALLELIZATION OPTIONS + if (defined $parallel){ + die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); + } + if ($bowtie2){ + if ($parallel){ + die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); + if ($parallel > 4){ + warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2); + } + push @bowtie_options,"-p $parallel"; + push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. + print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; + sleep (2); + } + } + + ### REPORTING OPTIONS + + if ($bowtie2){ + push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default + + ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while + if(defined $most_valid_alignments){ + + warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; + } + } + else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 + push @bowtie_options,'-k 2'; + } + + ### --BEST + if ($bowtie2){ + if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used + die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; + } + } + else{ + # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) + unless ($best){ + push @bowtie_options,'--best'; + } + } + + ### VANILLA BISMARK (BOWTIE 1) OUTPUT + if ($vanilla){ + if ($bowtie2){ + die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; + } + } + else{ + $vanilla = 0; + } + + ### PAIRED-END MAPPING + if ($mates1){ + my @mates1 = (split (/,/,$mates1)); + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2); + my @mates2 = (split(/,/,$mates2)); + unless (scalar @mates1 == scalar @mates2){ + die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n"; + } + while (1){ + my $mate1 = shift @mates1; + my $mate2 = shift @mates2; + last unless ($mate1 and $mate2); + push @filenames,"$mate1,$mate2"; + } + if ($bowtie2){ + push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments + push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones + } + + if ($old_flag){ + warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); + sleep(3); + } + } + elsif ($mates2){ + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n"; + } + + ### SINGLE-END MAPPING + # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified + my $singles; + unless ($mates1 and $mates2){ + $singles = join (',',@ARGV); + unless ($singles){ + die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; + } + $singles =~ s/\s/,/g; + @filenames = (split(/,/,$singles)); + warn "\nFiles to be analysed:\n"; + warn "@filenames\n\n"; + sleep (3); + } + + ### MININUM INSERT SIZE (PAIRED-END ONLY) + if (defined $minins){ + die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--minins $minins"; + } + + ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) + if (defined $maxins){ + die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--maxins $maxins"; + } + else{ + unless ($singles){ + push @bowtie_options,'--maxins 500'; + } + } + + ### QUIET prints nothing besides alignments (suppresses warnings) + if ($quiet){ + push @bowtie_options,'--quiet'; + } + + ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments + unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option + if (defined $chunk){ + push @bowtie_options,"--chunkmbs $chunk"; + } + else{ + push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) + } + } + + + ### SUMMARY OF ALL BOWTIE OPTIONS + my $bowtie_options = join (' ',@bowtie_options); + + + ### STRAND-SPECIFIC LIBRARIES + my $directional; + if ($non_directional){ + die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); + warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; + sleep (1); + $directional = 0; + } + elsif($pbat){ + die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); + die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); + + warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; + sleep (1); + $directional = 0; + } + else{ + warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; + sleep (1); + $directional = 1; # default behaviour + } + + ### UNMAPPED SEQUENCE OUTPUT + $unmapped = 0 unless ($unmapped); + + ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT + $multi_map = 0 unless ($multi_map); + + + ### OUTPUT DIRECTORY + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + + if (chdir $output_dir){ + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; + warn "Created output directory $output_dir!\n\n"; + chdir $output_dir or die "Failed to move to $output_dir\n"; + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + warn "Output will be written into the directory: $output_dir\n"; + } + else{ + $output_dir = ''; + } + + ### TEMPORARY DIRECTORY for C->T and G->A transcribed files + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($temp_dir){ + warn "\nUsing temp directory: $temp_dir\n"; + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + + if (chdir $temp_dir){ + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + else{ + mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; + warn "Created temporary directory $temp_dir!\n\n"; + chdir $temp_dir or die "Failed to move to $temp_dir\n"; + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + warn "Temporary files will be written into the directory: $temp_dir\n"; + } + else{ + $temp_dir = ''; + } + + ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE + if ($non_bs_mm){ + if ($vanilla){ + die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; + } + } + + ### PREFIX FOR OUTPUT FILES + if ($prefix){ + # removing trailing dots + + $prefix =~ s/\.+$//; + + warn "Using the following prefix for output files: $prefix\n\n"; + sleep(1); + } + + if (defined $multicore){ + unless ($multicore > 0){ + die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; + } + if ($multicore > 20){ + warn "Core usage currently set to more than 20 threads. This might fail horribly but let's see how it goes... (set value: $multicore)\n\n"; + } + if ($sam){ + die "The multicore function currently requires the output to be in BAM format, so please lose either option --sam or --multi\n"; + } + } + else{ + $multicore = 1; # default. Single-thread mode + warn "Setting parallelization to single-threaded (default)\n\n"; + } + + if ($basename and $multicore > 1){ + die "Specifying --basename in conjuction with --multicore is currently not supported (but we are aiming to fix this soon). Please lose either --basename or --multicore to proceed\n\n"; + } + + return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index_present,$multicore); +} + + + +sub generate_SAM_header{ + + print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order + + # Unordered printing of @SQ headers + # foreach my $chr (keys %chromosomes){ + # my $length = length ($chromosomes{$chr}); + # print "\@SQ\tSN:$chr\tLN:$length\n"; + # print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + # } + + foreach my $chr (sort {$a<=>$b} keys %SQ_order){ + # warn "$chr\t$SQ_order{$chr}\n"; + my $length = length ($chromosomes{$SQ_order{$chr}}); + print OUT "\@SQ\tSN:$SQ_order{$chr}\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + } + + print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version + +} + +### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: +### O. Tam (2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011), A. Dei Rossi (2014) + +sub single_end_SAM_output{ + + my ($id,$actual_seq,$methylation_call_params,$qual) = @_; + my $strand = $methylation_call_params->{$id}->{alignment_strand}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $start = $methylation_call_params->{$id}->{position}; + my $stop = $methylation_call_params->{$id}->{end_position}; + my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; + my $methcall = $methylation_call_params->{$id}->{methylation_call}; + my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + my $number_of_mismatches; + + if ($bowtie2){ + $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; + } + else{ + $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; + } + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2**7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ##### + + my $flag; # FLAG variable used for SAM format. + if ($strand eq "+"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (OT) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + elsif ($strand eq "-"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (OB) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + else{ + die "Unexpected strand information: $strand\n\n"; + } + + ##### + + my $mapq; + + if ($bowtie2){ + $mapq = $methylation_call_params->{$id}->{mapq}; + } + else{ + $mapq = 255; # Mapping quality is unavailable for use with Bowtie + } + + ##### + + my $cigar; + if ($bowtie2){ + $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 + } + else{ + $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) + } + + ##### + + my $rnext = "*"; # Paired-end variable + + ##### + + my $pnext = 0; # Paired-end variable + + ##### + + my $tlen = 0; # Paired-end variable + + ##### + + if ($read_conversion eq 'CT'){ + $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands + } + else{ + $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries + } + + if ($strand eq '-'){ + $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand + $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence + if ($cigar =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} ); + } + $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well + } + + ##### + + my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string + # into the reference string. hemming_dist() + if ($bowtie2){ + $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + + my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $MD_tag = make_mismatch_string($actual_seq, $ref_seq,$cigar,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag}); # Optional tag MD: string providing mismatched reference bases in the alignment (this does include indel information) + # my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) + + ##### + + my $XM_tag; # Optional tag XM: Methylation Call String + if ($strand eq '+'){ + $XM_tag = "XM:Z:$methcall"; + } + elsif ($strand eq '-'){ + $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well + } + + ##### + + my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches =~ s/-//; # removing the minus sign + + ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches + if ($cigar =~ /(D|I)/) { + # warn "$cigar\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches -= $insertion_open; + $number_of_mismatches -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches -= $deletion_open; + $number_of_mismatches -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + # warn "Alignment score $number_of_mismatches\n"; + # print "Mismatches $number_of_mismatches\n\n"; + } + ### Now we have InDel corrected alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division + # warn "N count: $seq_N_count\n"; + $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; + # warn "MM $number_of_mismatches\n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches"; + + ##### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; + } + else{ # default + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; + } +} + +sub paired_end_SAM_output{ + + my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; + my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand + my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; + my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; + my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; + my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; + my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; + my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + + my $id_1; + my $id_2; + + if ($old_flag){ + $id_1 = $id.'/1'; + $id_2 = $id.'/2'; + } + else{ + $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard + $id_2 = $id; + } + + # Allows all degenerate nucleotide sequences in reference genome + # die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHVX]/i; # X are padded nucleotides in case of insertions in the read + # die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHVX]/i; + + my $index; # used to store the srand origin of the alignment in a less convoluted way + + if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT (original top strand) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ + $index = 1; ## this is CTOB (complementary to OB) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ + $index = 2; ## this is CTOT (complementary to OT) + } + elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB (original bottom) + } + else { + die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; + } + + my $number_of_mismatches_1; + my $number_of_mismatches_2; + + if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine + $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; + } + else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation + if ($index == 2 or $index == 3){ # CTOT or OB + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; + } + else{ # if the first read aligned in forward direction it is like for Bowtie 2 + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; + } + } + + + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the + ### first or last position. + + if ($index == 0 or $index == 3){ # OT or OB + $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); + } + else{ # CTOT or CTOB + $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); + } + + ##### + + my $start_read_1; + my $start_read_2; + # adjusting end positions + + if ($bowtie2){ + $start_read_1 = $methylation_call_params->{$id}->{position_1}; + $start_read_2 = $methylation_call_params->{$id}->{position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; + $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; + } + else{ # read 1 is on the - strand + $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; + $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; + } + } + + ##### + + my $end_read_1; + my $end_read_2; + # adjusting end positions + + if ($bowtie2){ + $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; + $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; + $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; + } + else{ + $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; + $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; + } + } + + ##### + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2^^7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account + + # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) + # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) + + my $flag_1; # FLAG variable used for SAM format + my $flag_2; + + ### The new default FLAG values have been suggested by Peter Hickey, Australia + + if ($index == 0){ # OT + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) + $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + } + } + elsif ($index == 1){ # CTOB + unless($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + } + } + elsif ($index == 2){ # CTOT + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) + $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + } + } + elsif ($index == 3){ # OB + unless ($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + } + } + + ##### + + my $mapq; + + if ($bowtie2){ + $mapq = $methylation_call_params->{$id}->{mapq}; + } + else{ + $mapq = 255; # Mapping quality is unavailable for use with Bowtie + } + + ##### + + my $cigar_1; + my $cigar_2; + + if ($bowtie2){ + $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 + $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; + } + else{ + $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) + $cigar_2 = length($actual_seq_2) . "M"; + } + + ##### + + my $rnext = '='; # Chromosome of mate; applies to both reads + + ##### + + my $pnext_1 = $start_read_2; # Leftmost position of mate + my $pnext_2 = $start_read_1; + + ##### + + my $tlen_1; # signed observed Template LENgth (or inferred fragment size) + my $tlen_2; + + if ($bowtie2){ + + if ($start_read_1 <= $start_read_2){ + + # Read 1 alignment is leftmost + + if ($end_read_2 >= $end_read_1){ + + # -------------------------> read 1 reads overlapping + # <------------------------- read 2 + # + # or + # + # -------------------------> read 1 + # <----------------------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 reads 1 and 2 exactly overlapping + # <------------------------- read 2 + # + + # dovetailing of reads is not enabled for Bowtie 2 alignments + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + elsif ($end_read_2 < $end_read_1){ + + # -------------------------> read 1 + # <----------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 + # <------------------------ read 2 read 2 contained within read 1 + + # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable + $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, + $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + + } + + elsif ($start_read_2 < $start_read_1){ + + if ($end_read_1 >= $end_read_2){ + + # Read 2 alignment is leftmost + + # -------------------------> read 2 reads overlapping + # <------------------------- read 1 + # + # or + # + # -------------------------> read 2 + # <----------------------- read 1 read 1 contained within read 2 + # + # + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + elsif ($end_read_1 < $end_read_2){ + + # -------------------------> read 2 + # <----------- read 1 read 1 contained within read 2 + # + # or + # + # -------------------------> read 2 + # <------------------------ read 1 read 1 contained within read 2 + + # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable + $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, + $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + } + } + + else{ # Bowtie 1 + + if ($end_read_2 >= $end_read_1){ + # Read 1 alignment is leftmost + # -------------------------> read 1 + # <------------------------- read 2 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + else{ + # Read 2 alignment is leftmost + # -------------------------> read 2 + # <------------------------- read 1 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + } + + ##### + + # adjusting the strand of the sequence before we use them to generate mismatch strings + if ($strand_1 eq '-'){ + $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand + $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence + if ($cigar_1 =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} ); + } + $qual_1 = reverse $qual_1; # we need to reverse the quality string as well + } + if ($strand_2 eq '-'){ + $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand + $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence + if ($cigar_2 =~ /D/){ + $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} ); + } + $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well + } + + # print "$actual_seq_1\n$ref_seq_1\n\n"; + # print "$actual_seq_2\n$ref_seq_2\n\n"; + + ##### + + my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence + my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); + if ($bowtie2){ + $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences + my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information) + my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2}); + + # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) + # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); + + ##### + + my $XM_tag_1; # Optional tag XM: Methylation call string + my $XM_tag_2; + + if ($strand_1 eq '-'){ + $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_1 = "XM:Z:$methcall_1"; + } + + if ($strand_2 eq '-'){ + $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_2 = "XM:Z:$methcall_2"; + } + + ##### + + my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state + my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches_1 =~ s/-//; # removing the minus sign + $number_of_mismatches_2 =~ s/-//; + + ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches + + ### CIGAR 1 + if ($cigar_1 =~ /(D|I)/) { + # warn "$cigar_1\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_1); # storing the length per operation + my @ops = split (/\d+/,$cigar_1); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_1 -= $insertion_open; + $number_of_mismatches_1 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_1 -= $deletion_open; + $number_of_mismatches_1 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + # warn "Alignment score $number_of_mismatches_1\n"; + # print "Mismatches $number_of_mismatches_1\n\n"; + } + + ### CIGAR 2 + if ($cigar_2 =~ /(D|I)/) { + # warn "$cigar_2\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_2); # storing the length per operation + my @ops = split (/\d+/,$cigar_2); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; #irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_2 -= $insertion_open; + $number_of_mismatches_2 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_2 -= $deletion_open; + $number_of_mismatches_2 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + } + + ### Now we have InDel corrected Alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division + my $seq_2_N_count = $number_of_mismatches_2 % 6; + # warn "N count 1: $seq_1_N_count\n"; + # warn "N count 2: $seq_2_N_count\n"; + + $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; + $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; + + # warn "MM1 $number_of_mismatches_1 \n"; + # warn "MM2 $number_of_mismatches_2 \n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches_1"; + my $XB_tag = "XB:Z:$number_of_mismatches_2"; + + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; + } + else{ # default + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; + } +} + +sub revcomp{ + my $seq = shift or die "Missing seq to reverse complement\n"; + $seq = reverse $seq; + $seq =~ tr/ACTGactg/TGACTGAC/; + return $seq; +} + +sub hemming_dist{ + my $matches = 0; + my @actual_seq = split //,(shift @_); + my @ref_seq = split //,(shift @_); + + foreach (0..$#actual_seq){ + ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); + } + return my $hd = scalar @actual_seq - $matches; +} + + +### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might +### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels +### 05 June 2014 + + +sub make_mismatch_string{ + my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_; + + my $MD_tag = "MD:Z:"; + my $prev_matching = 0; + my $last_char; + + my $ref_base; + my $actual_base; + + foreach my $pos ( 0..(length$actual_seq) - 1 ){ + + $actual_base = substr($actual_seq,$pos,1); + $ref_base = substr($ref_seq,$pos,1); + # if ($verbose){ warn "reference: $ref_base\tseen base: $actual_base\n";} + + if ( $actual_base eq $ref_base ){ + ++$prev_matching; + } + else{ + # If the mismatch is due to an insertion we simply move on, else we print the previously matching bases as well as the mismatching genomic base + if ($ref_base eq 'X'){ + # if ($verbose){ warn "The genome base was an artificually padded '$ref_base' due to an insertion in the read at this position. Just ignoring it for the MD tag\n"; sleep(1);} + } + else{ + # if ($verbose){ warn "previous matching bases: $prev_matching\n";} + + ### There is a mismatch between the sequence and the genome. First we need to write out how may bases matched until now + if ($prev_matching == 0){ + # if ($verbose){ warn "Got a mismatch either at the very start or next to another mismatch. Need to add a padding 0 as well as the mismatch\n";} + # if ($verbose){ warn "${prev_matching}$ref_base\n";} + $MD_tag .= $prev_matching; + $MD_tag .= $ref_base; + } + else{ + # if ($verbose){ warn "${prev_matching}$ref_base\n";} + $MD_tag .= $prev_matching; + $MD_tag .= $ref_base; + } + + $prev_matching = 0; # resetting $prev_matching + } + + } + + } + ### appending the number of matches one last time + $MD_tag .= $prev_matching; + + + ### If the read contains deletion(s) we need to take care of these in the MD-tag as well + if ($cigar =~ /D/){ + my $deletions_total = 0; + while ($cigar =~ /D/g){ + ++$deletions_total; + } + if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";} + + if ($verbose){ warn "There was a deletion in the read!\n";} + if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";} + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my $MD_pos_so_far = 0; + my $deletions_processed = 0; + my $del_pos = 0; + my $deleted_bases = ''; + my $new_MD = $1 if ($MD_tag =~ /MD:Z:(.*)/); + my $md_index_already_processed; + + my @md = split //,$new_MD; + + if ($verbose){ warn "New MD-tag: $new_MD\n\n";} + $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag + $new_MD = ''; # using this to build up a new string that will replace the old \@md + + if ($verbose){ warn "CIGAR string; $cigar\n";} + ### determining end position of a read + foreach my $index(0..$#len){ + + if ($ops[$index] eq 'M'){ # matching bases + $del_pos += $len[$index]; + if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";} + } + elsif($ops[$index] eq 'I'){ # insertion + $del_pos += $len[$index]; + ### need to add insertions in the read to MD pos so far! + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";} + } + elsif($ops[$index] eq 'D'){ # deletion + if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";} + $deleted_bases = substr($md_sequence,$del_pos,$len[$index]); + if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";} + + ### Now we need to process the MD-tag so far and write out everything up until this point, inlcuding the deletion + if ($verbose){ warn "Now processing the MD-tag\n";} + my $op; + + my $this_deletion_processed; + my $md_processed_so_far; + my $current_md_index; + + foreach my $el (@md){ + + unless (defined $current_md_index){ + $current_md_index = 0; # first element = index 0 + } + else{ + ++$current_md_index; + } + + if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){ + if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";} + $new_MD .= $el; + next; + } + + if ($verbose){ warn "Current element: $el\n";} + unless (defined $op){ # initialize + $op = $el; + if ($verbose){ warn "Initializing \$op as $op\n";} + next; + } + + if ($deletions_processed == $deletions_total){ + if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";} + $MD_tag .= $el; + $new_MD .= $el; + next; + } + # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag + if ($this_deletion_processed){ + $new_MD .= $el; + next; + } + + if ($op =~ /^\d+$/){ + if ($verbose){ warn "Operation so far was a digit: $op\n";} + if ($el =~ /\d/){ + $op .= $el; + if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";} + next; + } + else{ + if ($verbose){ warn "current element is a word character: $el\n";} + + ### Need to determine if the matching operation length includes the deletion position + if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";} + $MD_pos_so_far += $op; + if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";} + if ($MD_pos_so_far < $del_pos){ + if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";} + $MD_tag .= $op; + $new_MD .= $op; + if ($verbose){ warn "Setting new operation to: $el\n";} + $op = $el; # setting new $op + } + else{ + if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";} + ### splitting up the number of matching bases in number before and after the deletion + + my $pos_after_deletion = $MD_pos_so_far - $del_pos; + my $pos_before_deletion = $op - $pos_after_deletion; + if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";} + $MD_tag .= "${pos_before_deletion}^${deleted_bases}"; + $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}"; + if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";} + + #adjusting the MD_position by the number of bases after the deletion + $MD_pos_so_far -= $pos_after_deletion; + if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; } + ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD + $new_MD .= $el; + + $deletions_processed += 1; + $this_deletion_processed = 1; + + if ($deletions_processed == $deletions_total){ # this was the last deletion of the read + if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";} + $MD_tag .= "${pos_after_deletion}"; + + ### also appending the current element because we are writing out the rest of the MD-string unchanged + if ($verbose){ warn "also appending the current element $el\n";} + $MD_tag .= $el; + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + } + else{ + if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";} + if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";} + + + ### This migh need looking at!! + + $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op; + if ($verbose){ warn "Current index = $current_md_index\n";} + + if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";} + $md_index_already_processed = $current_md_index - 1; + + if ($verbose){ warn "Exiting now and waiting for the next deletion\n";} + + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + #setting $op to en empty string so it is not being processed as the last element + $op = ''; + # last; # exiting the loop and processing the CIGAR string further until we hit the next deletion + } + } + } + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + else{ + if ($verbose){ warn "Operation so far was a word character: $op\n";} + if ($el =~ /\d+/){ + # processing the previous mismatch position + $MD_tag .= $op; + $new_MD .= $op; + $MD_pos_so_far += length($op); + if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";} + } + else{ + # this should never occur since mismatches are followed by a 0 or another digit + die "current element is a another word character: $el. This should never happen!\n"; + } + if ($verbose){ warn "Setting new operation to: $el\n";} + $op = $el; # setting new $op + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + } + + ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag + if ($op =~ /\d+/ and $deletions_processed < $deletions_total){ + if ($verbose){ warn "\n\nlast operation was $op\n";} + if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";} + + $MD_pos_so_far += $op; + if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";} + if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";} + if ($MD_pos_so_far >= $del_pos){ + if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";} + ### splitting up the number of matching bases in number before and after the deletion + + my $pos_after_deletion = $MD_pos_so_far - $del_pos; + my $pos_before_deletion = $op - $pos_after_deletion; + if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";} + + $MD_tag .= "${pos_before_deletion}^${deleted_bases}"; + $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}"; + + #adjusting the MD_position by the number of bases after the deletion + $MD_pos_so_far -= $pos_after_deletion; + if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; } + + $deletions_processed += 1; + $this_deletion_processed = 1; + + if ($deletions_processed == $deletions_total){ # this was the last deletion of the read + if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";} + $MD_tag .= "${pos_after_deletion}"; + + } + else{ + if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";} + if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";} + + $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op; + if ($verbose){ warn "Current index = $current_md_index\n";} + + if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";} + # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (tit hasn't been incremented in the first place...) + $md_index_already_processed = $current_md_index; + + if ($verbose){ warn "Exiting now and waiting for the next deletion\n";} + + $MD_pos_so_far += $len[$index]; + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";} + } + ### Finally also adding the length of the deletion to $del_pos + $del_pos += $len[$index]; + if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";} + } + else{ + die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n"; + } + } + + # forming a new @md + @md = split //,$new_MD; + $new_MD = ''; + if ($verbose){ warn "New \@md array: @md\n\n";} + if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";} + + } + else{ + die "Found CIGAR operations other than M, I, D or N: '$ops[$index]'. Not allowed at the moment\n"; + } + } + + } + if ($verbose){ warn "Returning MD-tag: $MD_tag\n";} + return $MD_tag; + +} + +### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might +### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels +# sub make_mismatch_string{ +# my $actual_seq = shift or die "Missing actual sequence\n"; +# my $ref_seq = shift or die "Missing reference sequence\n"; +# my $XX_tag = "XX:Z:"; + +# my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison + +# warn "'$tmp'\n"; sleep(1); +# my $prev_mm_pos = 0; + +# while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference +# my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch +# my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read +# $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other +# $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) +# $prev_mm_pos = pos($tmp); # Position of last mismatch +# } +# my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence +# $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence +# return $XX_tag; +# } + + + +sub print_helpfile{ + print << "HOW_TO"; + + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + + +DESCRIPTION + + +The following is a brief description of command line options and arguments to control the Bismark +bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the +reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand +version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). +Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome +(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the +forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) +are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original +sequence from the genome and determine if there were any protected C's present or not. + +As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be +re-enabled by using --non_directional. + +The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old +'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best +alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. + + +USAGE: bismark [options] {-1 -2 | } + + +ARGUMENTS: + + The path to the folder containing the unmodified reference genome + as well as the subfolders created by the Bismark_Genome_Preparation + script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). + Bismark expects one or more fastA files in this folder (file extension: .fa + or .fasta). The path can be relative or absolute. + +-1 Comma-separated list of files containing the #1 mates (filename usually includes + "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. Bismark will produce one mapping result + and one report file per paired-end input file pair. + +-2 Comma-separated list of files containing the #2 mates (filename usually includes + "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. + + A comma- or space-separated list of files containing the reads to be aligned (e.g. + lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will + produce one mapping result and one report file per input file. + + +OPTIONS: + + +Input: + +-q/--fastq The query input files (specified as , or are FASTQ + files (usually having extension .fg or .fastq). This is the default. See also + --solexa-quals. + +-f/--fasta The query input files (specified as , or are FASTA + files (usually havin extension .fa, .mfa, .fna or similar). All quality values + are assumed to be 40 on the Phred scale. FASTA files are expected to contain both + the read name and the sequence on a single line (and not spread over several lines). + +-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input. + +-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit. + +--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. + +--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. + +--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled + (which can't). The formula for conversion is: + phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This + is usually the right option for use with (unconverted) reads emitted by the GA + Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. + +--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) + reads emitted by GA Pipeline version 1.3 or later. Default: off. + +--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not + specified it is assumed that Bowtie (1 or 2) is in the PATH. + + +Alignment: + +-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs + of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the + default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). + +-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to + which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for + larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). + +-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout + the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds + quality values to the nearest 10 and saturates at 30. This value is not relevant for + Bowtie 2. + +--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in + --best mode. Best-first search must keep track of many paths at once to ensure it is + always extending the path with the lowest cumulative cost. Bowtie tries to minimize the + memory impact of the descriptors, but they can still grow very large in some cases. If + you receive an error message saying that chunk memory has been exhausted in --best mode, + try adjusting this parameter up to dedicate more memory to the descriptors. This value + is not relevant for Bowtie 2. Default: 512. + +-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and + a paired-end alignment consists of two 20-bp alignments in the appropriate orientation + with a 20-bp gap between them, that alignment is considered valid (as long as -X is also + satisfied). A 19-bp gap would not be valid in that case. Default: 0. + +-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and + a paired-end alignment consists of two 20-bp alignments in the proper orientation with a + 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). + A 61-bp gap would not be valid in that case. Default: 500. + +--multicore Sets the number of parallel instances of Bismark to be run concurrently. This forks the + Bismark alignment step very early on so that each individual Spawn of Bismark processes + only every n-th sequence (n being set by --multicore). Once all processes have completed, + the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged + into single files in very much the same way as they would have been generated running Bismark + conventionally with only a single instance. + + If system resources are plentiful this is a viable option to speed up the alignment process + (we observed a near linear speed increase for up to --multicore 8 tested). However, please note + that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of + Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner + and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified + will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for + e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB or RAM, but at the same time + reduce the alignment time to ~25-30%. You have been warned. + + + +Bowtie 1 Reporting: + +-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option + will be used by default. + +--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum + (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in + terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred + quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both + have Phred quality 10. When --best is not specified, Bowtie may report alignments that + are sub-optimal in terms of stratum and/or quality (though an effort is made to report + the best alignment). --best mode also removes all strand bias. Note that --best does not + affect which alignments are considered "valid" by Bowtie, only which valid alignments + are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. + Default: on. + +--no_best Disables the --best option which is on by default. This can speed up the alignment process, + e.g. for testing purposes, but for credible results it is not recommended to disable --best. + + +Output: + +--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four + bisulfite strands will be reported. Default: OFF. + + (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary + to the original strands are merely theoretical and should not exist in reality. Specifying directional + alignments (which is the default) will only run 2 alignment threads to the original top (OT) + or bottom (OB) strands in parallel and report these alignments. This is the recommended option + for sprand-specific libraries). + +--pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., + PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, + as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT + and OB ones. Use this option only if you are certain that your libraries were constructed following + a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option + --pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed + temporary files only). + +--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are + split up into several smaller files to run concurrently and the output files are to be merged. + +--quiet Print nothing besides alignments. + +--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead + of SAM format output. + +-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will + appear as they did in the input, without any translation of quality values that may have + taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 + and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads + with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) + are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. + +--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest + mismatches or other reads that fail to align uniquely to a file in the output directory. + Written reads will appear as they did in the input, without any of the translation of quality + values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two + parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and + _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. + +-o/--output_dir Write all output files into this directory. By default the output files will be written into + the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt + to create it first. The path to the output folder can be either relative or absolute. + +--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If + the specified folder does not exist, Bismark will attempt to create it first. The path to the + temporary folder can be either relative or absolute. + +--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the + alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is + just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. + The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' + and 'XB:Z:number of mismatches' for read 2 of paired-end reads. + +--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk + space. This option is available for most alignment modes but is not available for paired-end FastA + files. This option might be somewhat slower than writing out uncompressed files, but this awaits + further testing. + +--sam The output will be written out in SAM format instead of the default BAM format. Bismark will + attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't + been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, + the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--prefix Prefixes to the output filenames. Trailing dots will be replaced by a single one. For + example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. + +-B/--basename Write all output to files starting with this base file name. For example, '--basename foo' + would result in the files 'foo.sam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes + precedence over --prefix. + +--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, + this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since + both the appended read IDs and custom FLAG values may cause problems with some downstream tools + such as Picard, new defaults were implemented as of version 0.8.3. + + + default old_flag + =================== =================== + Read 1 Read 2 Read 1 Read 2 + + OT: 99 147 67 131 + + OB: 83 163 115 179 + + CTOT: 99 147 67 131 + + CTOB: 83 163 115 179 + + +Other: + +-h/--help Displays this help file. + +-v/--version Displays version information. + + +BOWTIE 2 SPECIFIC OPTIONS + +--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end + alignments, i.e. searches for alignments involving all read characters (also called + untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter + and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2 + indexes are supported. Default: off. + +Bowtie 2 alignment options: + +-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. + Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) + but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for + Bowtie 1 see -n). + +-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values + make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is + used by default, which sets -L to 20. This option is only available for Bowtie 2 (for + Bowtie 1 see -l). + +--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched + position to be the highest possible, regardless of the actual value. I.e. input is treated + as though all quality values are high. This is also the default behavior when the input + doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. + + +Bowtie 2 paired-end options: + +--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if + it cannot find a concordant or discordant alignment for a pair. This option is invariable and + and on by default. + +--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. + A discordant alignment is an alignment where both mates align uniquely, but that does not + satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior + and it is on by default. + + +Bowtie 2 effort options: + +-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using + the alignments found so far. A seed extension "fails" if it does not yield a new best or a + new second-best alignment. Default: 15. + +-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. + When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of + mismatches allowed) at different offsets and searches for more alignments. A read is considered + to have repetitive seeds if the total number of seed hits divided by the number of seeds + that aligned at least once is greater than 300. Default: 2. + +Bowtie 2 parallelization options: + + +-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores + and synchronize when parsing reads and outputting alignments. Searching for alignments is highly + parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. + E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint + by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads + library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will + automatically use the option '--reorder', which guarantees that output SAM records are printed in + an order corresponding to the order of the reads in the original input file, even when -p is set + greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and + setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then + if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally + correspond to input order in that case. + +Bowtie 2 Scoring options: + +--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered + "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying + L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. + See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is + L,0,-0.2. + +--rdg , Sets the read gap open () and extend () penalties. A read gap of length N gets a penalty + of + N * . Default: 5, 3. + +--rfg , Sets the reference gap open () and extend () penalties. A reference gap of length N gets + a penalty of + N * . Default: 5, 3. + + +Bowtie 2 Reporting options: + +-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is + deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the + default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the + effort expended to find valid alignments. + + For reference, this used to be the old (now deprecated) description of -M: + Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it + can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever + happens first. Only the best alignment is reported. Information from the other alignments is used to + estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes + Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that + aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not + guarantee that the alignment reported is the best possible in terms of alignment score. -M is + always used and its default value is set to 10. + + +'VANILLA' Bismark OUTPUT: + +Single-end output format (tab-separated): + + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(11) + + +Paired-end output format (tab-separated): + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(10) +(11) +(12) +(14) +(15) + + +Bismark SAM OUTPUT (default): + + (1) QNAME (seq-ID) + (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) + (3) RNAME (chromosome) + (4) POS (start position) + (5) MAPQ (always 255 for use with Bowtie) + (6) CIGAR + (7) RNEXT + (8) PNEXT + (9) TLEN +(10) SEQ +(11) QUAL (Phred33 scale) +(12) NM-tag (edit distance to the reference) +(13) MD-tag (base-by-base mismatches to the reference (handles indels) +(14) XM-tag (methylation call string) +(15) XR-tag (read conversion state for the alignment) +(16) XG-tag (genome conversion state for the alignment) +(17) XA/XB-tag (non-bisulfite mismatches) (optional!) + +Each read of paired-end alignments is written out in a separate line in the above format. + + +Last edited on 06 May 2015. + +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 new/bismark_genome_preparation --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new/bismark_genome_preparation Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,468 @@ +#!/usr/bin/perl -- +use strict; +use warnings; +use Cwd; +# use File::Path qw(rmtree); +$|++; + + +## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +use Getopt::Long; +use Cwd; + +my $verbose; +my $help; +my $version; +my $man; +my $path_to_bowtie; +my $multi_fasta; +my $single_fasta; +my $bowtie2; + +my $bismark_version = 'v0.14.3'; + +GetOptions ('verbose' => \$verbose, + 'help' => \$help, + 'man' => \$man, + 'version' => \$version, + 'path_to_bowtie:s' => \$path_to_bowtie, + 'single_fasta' => \$single_fasta, + 'bowtie2' => \$bowtie2, + ); + +if ($help or $man){ + print_helpfile(); + exit; +} + +if ($version){ + print << "VERSION"; + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Genome Preparation Version: $bismark_version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + +VERSION + exit; +} + +my $genome_folder = shift @ARGV; # mandatory +my %chromosomes; # checking if chromosome names are unique (required) + +# Ensuring a genome folder has been specified +if ($genome_folder){ + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } + $verbose and print "Path to genome folder specified as: $genome_folder\n"; + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + # making the genome folder path abolsolute so it won't break if the path was specified relative + $genome_folder = getcwd; + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } +} +else{ + die "Please specify a genome folder to be used for bisulfite conversion\n\n"; +} + + +my $CT_dir; +my $GA_dir; + + +if ($single_fasta){ + print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; + $multi_fasta = 0; +} +else{ + print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; + $single_fasta = 0; + $multi_fasta = 1; +} + +my @filenames = create_bisulfite_genome_folders(); + +process_sequence_files (); + +launch_bowtie_indexer(); + +sub launch_bowtie_indexer{ + if ($bowtie2){ + print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; + } + else{ + print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; + } + print "Please be aware that this process can - depending on genome size - take up to several hours!\n"; + sleep(5); + + ### if the path to bowtie was specfified explicitely + if ($path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie =~ s/$/bowtie2-build/; + } + else{ + $path_to_bowtie =~ s/$/bowtie-build/; + } + } + ### otherwise we assume that bowtie-build is in the path + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2-build'; + } + else{ + $path_to_bowtie = 'bowtie-build'; + } + } + + $verbose and print "\n"; + + ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) + my $pid = fork(); + + # parent process + if ($pid){ + sleep(1); + chdir $CT_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; + + sleep (11); + exec ("$path_to_bowtie","-f","$file_list","BS_CT"); + } + + # child process + elsif ($pid == 0){ + sleep(2); + chdir $GA_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; + $verbose and print "(starting in 10 seconds)\n"; + sleep(10); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } + + # if the platform doesn't support the fork command we will run the indexing processes one after the other + else{ + print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; + sleep(10); + + ### moving to CT genome folder + $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; + chdir $CT_dir or die "Unable to change directory: $!\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + sleep(2); + system ("$path_to_bowtie","-f","$file_list","BS_CT"); + @fasta_files=(); + $file_list= ''; + + ### moving to GA genome folder + $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; + chdir $GA_dir or die "Unable to change directory: $!\n"; + @fasta_files = <*.fa>; + $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + sleep(2); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } +} + + +sub process_sequence_files { + + my ($total_CT_conversions,$total_GA_conversions) = (0,0); + $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; + sleep (3); + + $verbose and print "conversions performed:\n"; + $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; + + + ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file + ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle + ### This is now the default option + + if ($multi_fasta){ + ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files + my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + foreach my $filename(@filenames){ + my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; + # warn "Reading chromosome information from $filename\n\n"; + + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + ### Exiting if a chromosome with the same name was present already + if (exists $chromosomes{$chromosome_name}){ + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else{ + $chromosomes{$chromosome_name}++; + } + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry + + + while (){ + + ### in case the line is a new fastA header + if ($_ =~ /^>/){ + ### printing out the stats for the previous chromosome + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + ### resetting the chromosome transliteration counters + ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + + ### Extracting chromosome name from the additional FastA header + $chromosome_name = extract_chromosome_name($_); + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; + } + + else{ + my $sequence = uc$_; + + ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) + + $sequence =~ s/[^ATCGN\n\r]/N/g; + + ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) + + my $CT_sequence = $sequence; + my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts + $total_CT_conversions += $CT_transliterations_performed; + $chromosome_CT_conversions += $CT_transliterations_performed; + + print CT_CONVERT $CT_sequence; + + ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- + ### complementing the forward strand and then C->T converting it) + + my $GA_sequence = $sequence; + my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand + $total_GA_conversions += $GA_transliterations_performed; + $chromosome_GA_conversions += $GA_transliterations_performed; + + print GA_CONVERT $GA_sequence; + + } + } + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + } + close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; + close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; + + + print "\nTotal number of conversions performed:\n"; + print "C->T:\t$total_CT_conversions\n"; + print "G->A:\t$total_GA_conversions\n"; + + warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; +} + +sub extract_chromosome_name { + + my $header = shift; + + ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well + + if ($header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$header); + return $chromosome_name; + } + else{ + die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n"; + } +} + +sub create_bisulfite_genome_folders{ + + $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; + + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (chdir $path_to_bowtie){ + if ($bowtie2){ + $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; + } + else{ + $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; + } + } + else{ + die "There was an error with the path to bowtie: $!\n"; + } + } + + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + + # Exiting unless there are fastA files in the folder + my @filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@filenames){ + @filenames = <*.fasta>; + } + + unless (@filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; + } + + warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n"; + sleep (3); + + # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists + my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; + unless (-d $bisulfite_dir){ + mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; + } + else{ + print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; + sleep(5); + } + + chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; + $CT_dir = "${bisulfite_dir}CT_conversion/"; + $GA_dir = "${bisulfite_dir}GA_conversion/"; + + # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) + # converted version of the genome + unless (-d $CT_dir){ + mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; + } + unless (-d $GA_dir){ + mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; + } + + # moving back to the original genome folder + chdir $genome_folder or die "Could't move to directory $genome_folder $!"; + # $verbose and print "Moved back to genome folder folder $genome_folder\n"; + warn "\nStep I - Prepare genome folders - completed\n\n\n"; + return @filenames; +} + +sub print_helpfile{ + print << 'HOW_TO'; + + +DESCRIPTION + +This script is supposed to convert a specified reference genome into two different bisulfite +converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first +bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs +converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference +genome folder. Once the bisulfite conversion has been completed the program will fork and launch +two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware +that the indexing process can take up to several hours; this will mainly depend on genome size +and system resources. + + + +The following is a brief description of command line options and arguments to control the +Bismark Genome Preparation: + + +USAGE: bismark_genome_preparation [options] + + +OPTIONS: + +--help/--man Displays this help filea and exits. + +--version Displays version information and exits. + +--verbose Print verbose output for more details or debugging. + +--path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system + (depending on which aligner/indexer you intend to use). Unless this path + is specified it is assumed that Bowtie is in the PATH. + +--bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1). + +--single_fasta Instruct the Bismark Indexer to write the converted genomes into + single-entry FastA files instead of making one multi-FastA file (MFA) + per chromosome. This might be useful if individual bisulfite converted + chromosomes are needed (e.g. for debugging), however it can cause a + problem with indexing if the number of chromosomes is vast (this is likely + to be in the range of several thousand files; the operating system can + only handle lists up to a certain length, and some newly assembled + genomes may contain 20000-50000 contigs of scaffold files which do exceed + this list length limit). + + +ARGUMENTS: + + The path to the folder containing the genome to be bisulfite converted. + The Bismark Genome Preparation expects one or more fastA files in the folder + (with the file extension: .fa or .fasta). Specifying this path is mandatory. + + +This script was last modified on 16 Oct 2014. +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 new/bismark_methylation_extractor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/new/bismark_methylation_extractor Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,5875 @@ +#!/usr/bin/perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + +## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my @filenames; # input files +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.14.3'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore) = process_commandline(); + + +### only needed for bedGraph output +my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @bedfiles; + +### only needed for genome-wide cytosine methylation report +my %chromosomes; + +my %mbias_1; +my %mbias_2; + + +############################################################################################## +### Summarising Run Parameters +############################################################################################## + +### METHYLATION EXTRACTOR + +warn "Summarising Bismark methylation extractor parameters:\n"; +warn '='x63,"\n"; + +if ($single){ + if ($vanilla){ + warn "Bismark single-end vanilla format specified\n"; + } + else{ + warn "Bismark single-end SAM format specified (default)\n"; # default + } +} +elsif ($paired){ + if ($vanilla){ + warn "Bismark paired-end vanilla format specified\n"; + } + else{ + warn "Bismark paired-end SAM format specified (default)\n"; # default + } +} + +warn "Number of cores to be used: $multicore\n"; + +if ($single){ + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string\n"; + } + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string\n"; + } + +} +else{ ## paired-end + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_r2){ + warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + if ($ignore_3prime){ + warn "Last $ignore_3prime bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_3prime_r2){ + warn "Last $ignore_3prime_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } + + +} + + +if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; +} +if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; +} +### output directory +if ($output_dir eq ''){ + warn "Output will be written to the current directory ('$parent_dir')\n"; +} +else{ + warn "Output path specified as: $output_dir\n"; +} + + +sleep (1); + +### BEDGRAPH + +if ($bedGraph){ + warn "\n\nSummarising bedGraph parameters:\n"; + warn '='x63,"\n"; + + if ($counts){ + warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t \ncoverage format:\t \n\n"; + } + else{ + warn "Generating additional sorted output in bedGraph format (output format: )\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Writing out an additional coverage file (ending in zero.cov) with 0-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + + warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; + + if ($CX_context){ + warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; + } + + if ($remove){ + warn "White spaces in read ID names will be removed prior to sorting\n"; + } + + if ($ample_mem){ + warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; + } + elsif (defined $sort_size){ + warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; + } + else{ + warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; + } + + + + sleep (1); + + if ($cytosine_report){ + warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; + warn '='x63,"\n"; + warn "Generating comprehensive genome-wide cytosine report\n(output format: )\n"; + + + if ($CX_context){ + warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting cytosine methylation in CpG context only (default)\n"; + } + + if ($split_by_chromosome){ + warn "Splitting the cytosine report output up into individual files for each chromosome\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Using zero-based start and 1-based end genomic coordinates (zero-based, half-open; user-defined)\n"; + } + else{ # default, 1-based coords + warn "Using 1-based genomic coordinates (default)\n"; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + warn "Genome folder was specified as $genome_folder\n"; + } + else{ + $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; + warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; + } + sleep (1); + } +} + +warn "\n"; +sleep (1); + +###################################################### +### PROCESSING FILES +###################################################### + +foreach my $filename (@filenames){ + # resetting counters and filehandles + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + @sorting_files = (); + @bedfiles = (); + + %mbias_1 = (); + %mbias_2 = (); + + + ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor + unless ($vanilla){ + if ($paired){ + test_positional_sorting($filename); + } + } + + my ($pid,$pids,$report_basename) = process_Bismark_results_file($filename); + + if ($pid == 0){ + warn "Finished processing child process. Exiting..\n"; + + # ### Closing all filehandles of the child process so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + # foreach my $fh (keys %fhs) { + # if ($fh =~ /^[1230]$/) { + # foreach my $context (keys %{$fhs{$fh}}) { + # $fhs{$fh}->{$context}->flush; + # } + # } + # else{ + # $fhs{$fh}->flush; + # } + # } + exit 0; + } + + ### + if ($pid and $multicore > 1){ + warn "Now waiting for all child processes to complete\n"; + sleep(1); + + ### we need to ensure that we wait for all child processes to be finished before continuing + # warn "here are the child IDs: @$pids\n"; + # warn "Looping through the child process IDs:\n"; + + foreach my $id (@$pids){ + # print "$id\t"; + my $kid = waitpid ($id,0); + # print "Returned: $kid\nExit status: $?\n"; + unless ($? == 0){ + warn "\nChild process terminated with exit signal: '$?'\n\n"; + } + } + } + + ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + foreach my $fh (keys %fhs) { + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + close $fhs{$fh}->{$context} or die $!; + } + } + else{ + close $fhs{$fh} or die $!; + } + } + + ### We need to stitch together a main splitting report from all individual parent/child processes + if ($multicore > 1){ + merge_individual_splitting_reports($report_basename); + print_splitting_report(); + + merge_individual_mbias_reports($report_basename); # this updates the main %mbias_1 and %mbias_2 data structures so we can proceed normally + + } + + unless ($mbias_off){ + ### printing out all M-Bias data + produce_mbias_plots ($filename); produce_mbias_plots ($filename); + + } + + unless ($mbias_only){ + delete_unused_files(); + } + + if ($bedGraph){ + + my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $out =~ s/gz$//; + $out =~ s/sam$//; + $out =~ s/bam$//; + $out =~ s/txt$//; + $out =~ s/$/bedGraph/; + + my $bedGraph_output = $out; + my @args; + + if ($remove){ + push @args, '--remove'; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($no_header){ + push @args, '--no_header'; + } + if ($gazillion){ + push @args, '--gazillion'; + } + if ($ample_mem){ + push @args, '--ample_memory'; + } + if ($zero){ + push @args, "--zero"; + } + + # if ($counts){ + # push @args, "--counts"; + # } + + push @args, "--buffer_size $sort_size"; + push @args, "--cutoff $coverage_threshold"; + push @args, "--output $bedGraph_output"; + push @args, "--dir '$output_dir'"; + + ### adding all files to be sorted to @args + foreach my $f (@sorting_files){ + push @args, $f; + } + + # print join "\t",@args,"\n"; + + system ("$Bin/bismark2bedGraph @args"); + + warn "Finished BedGraph conversion ...\n\n"; + sleep(1); + + # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; + # warn "Writing bedGraph to file: $bedGraph_output\n"; + # process_bedGraph_output(); + # close OUT or die $!; + + ### genome-wide cytosine methylation report requires bedGraph processing anyway + if ($cytosine_report){ + + @args = (); # resetting @args + my $cytosine_out = $out; + $cytosine_out =~ s/bedGraph$//; + + if ($CX_context){ + $cytosine_out =~ s/$/CX_report.txt/; + } + else{ + $cytosine_out =~ s/$/CpG_report.txt/; + } + + push @args, "--output $cytosine_out"; + push @args, "--dir '$output_dir'"; + push @args, "--genome '$genome_folder'"; + push @args, "--parent_dir '$parent_dir'"; + + if ($zero){ + push @args, "--zero"; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($split_by_chromosome){ + push @args, '--split_by_chromosome'; + } + + my $coverage_output = $bedGraph_output; + $coverage_output =~ s/bedGraph$/bismark.cov/; + + push @args, $output_dir . $coverage_output; # this will be the infile + + system ("$Bin/coverage2cytosine @args"); + # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); + warn "\n\nFinished generating genome-wide cytosine report\n\n"; + } + } +} + +sub merge_individual_splitting_reports{ + + my $report_basename = shift; + + my @splitting_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @splitting_reports, "$report_basename.$ext"; + } + warn "\nMerging individual splitting reports into overall report: '$report_basename'\n"; + warn "Merging from these individual files:\n"; + print join ("\n",@splitting_reports),"\n\n"; + sleep(1); + + ########## + # resetting the counter first + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + methylation_call_strings => 0, + ); + + # repopulating the merged counter + foreach my $file (@splitting_reports){ + open (IR,$file) or die $!; + while (){ + chomp; + my ($context,$count) = (split /\t/); + if ($context){ + if ($context =~ /^Total C to T conversions in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CpG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHG_count} += $count; + } + elsif ($context =~ /^Total C to T conversions in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_unmethylated_CHH_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CpG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCpG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHG context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHG_count} += $count; + } + elsif ($context =~ /^Total methylated C\'s in CHH context/){ + # warn "context: $context\ncount: $count\n"; + $counting{total_meCHH_count} += $count; + } + elsif ($context =~ /^line count/){ + # warn "Line count\ncount: $count\n"; + $counting{sequences_count} = $count; # always the same + } + elsif ($context =~ /^meth call strings/){ + # warn "Meth call strings\ncount: $count\n"; + $counting{methylation_call_strings} += $count; + } + } + } + } + + # deleting the individual reports afterwards + foreach my $file (@splitting_reports){ + unlink $file; + } +} + + +sub merge_individual_mbias_reports{ + + my $report_basename = shift; + + my @mbias_reports; # only needed in multi-core mode to generate an overall report + foreach my $ext (1..$multicore){ + push @mbias_reports, "$report_basename.${ext}.mbias"; + } + warn "\nMerging individual M-bias reports into overall M-bias statistics from these $multicore individual files:\n"; + print join ("\n",@mbias_reports),"\n\n"; + + + ########## + # resetting the counters first, then repopulating them + %mbias_1 = (); + %mbias_2 = (); + + # repopulating the merged counter + foreach my $file (@mbias_reports){ + open (IR,$file) or die $!; + + my $context; + my $read; + + while (){ + chomp; + # warn "$_\n"; sleep(1); + if ($_ =~ /context/){ + $context = $1 if ($_ =~ /(\D{3}) context/); + # warn "Context set as $context\n"; + + if ($_ =~ /R2/){ + $read = 'R2'; + } + else{ + $read = 'R1'; + } + # warn "Setting read identity to '$read'\n"; + + # reading in 2 additional lines (===========, and header line) + $_ = ; + #warn "discarding line $_\n"; + $_ = ; + #warn "discarding line $_\n"; + next; + } + else{ + + if ($_ eq ''){ + # empty line, only occurs after a context has finished and before a new context starts + next; + } + + my ($pos,$meth,$unmeth) = (split /\t/); + # warn "$pos\t$meth\t$unmeth\n"; sleep(1); + if ($read eq 'R1'){ + $mbias_1{$context}->{$pos}->{meth} += $meth; + $mbias_1{$context}->{$pos}->{un} += $unmeth; + } + elsif ($read eq 'R2'){ + $mbias_2{$context}->{$pos}->{meth} += $meth; + $mbias_2{$context}->{$pos}->{un} += $unmeth; + } + } + } + close IR or warn "Had trouble closing filehandle for $file: $!\n"; + } + + # deleting the individual reports afterwards + foreach my $file (@mbias_reports){ + unlink $file; + } +} + + +sub delete_unused_files{ + + warn "Deleting unused files ...\n\n"; sleep(1); + + my $index = 0; + + while ($index <= $#sorting_files){ + if ($sorting_files[$index] =~ /gz$/){ + open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + else{ + open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + + my $used = 0; + + while (){ + next if (/^Bismark/); + if ($_){ + $used = 1; + last; + } + } + + if ($used){ + warn "$sorting_files[$index] contains data ->\tkept\n"; + ++$index; + } + else{ + + my $delete = unlink $sorting_files[$index]; + + if ($delete){ + warn "$sorting_files[$index] was empty ->\tdeleted\n"; + } + else{ + warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" + } + + ### we also need to remove the element from @sorting_files + splice @sorting_files, $index, 1; + } + } + warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) +} + +sub produce_mbias_plots{ + + my $filename = shift; + + my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $mbias =~ s/gz$//; + $mbias =~ s/sam$//; + $mbias =~ s/bam$//; + $mbias =~ s/txt$//; + my $mbias_graph_1 = my $mbias_graph_2 = $mbias; + $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; + $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; + + $mbias =~ s/$/M-bias.txt/; + + open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + warn "Determining maximum read length for M-Bias plot\n"; + warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + warn "Determining maximum read lengths for M-Bias plots\n"; + warn "Maximum read length of Read 1: $max_length_1\n"; + warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + # sleep(3); + + my @mbias_read1; + my @mbias_read2; + + #Check whether the module GD::Graph:lines is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::lines; + GD::Graph::lines->import(); + }; + + unless($@) { # syntax or routine error variable, set if something goes wrong in the last eval{ require ...} + $gd_graph_installed = 1; + + #Check whether the module GD::Graph::colour is installed + eval{ + require GD::Graph::colour; + GD::Graph::colour->import(qw(:colours :lists :files :convert)); + }; + + if ($@) { + warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + $gd_graph_installed = 0; + } + + + } + else{ + warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + } + + + my $graph_title; + my $graph1; + my $graph2; + + if ( $gd_graph_installed){ + $graph1 = GD::Graph::lines->new(800,600); + if ($paired){ + $graph2 = GD::Graph::lines->new(800,600); + } + } + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read1[0]} = (); + + if ($paired){ + print MBIAS "$context context (R1)\n================\n"; + $graph_title = 'M-bias (Read 1)'; + } + else{ + print MBIAS "$context context\n===========\n"; + $graph_title = 'M-bias'; + } + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + push @{$mbias_read1[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read1[1]},$percent; + push @{$mbias_read1[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read1[2]},$percent; + push @{$mbias_read1[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read1[3]},$percent; + push @{$mbias_read1[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph1->set( + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# methylation calls', + title => $graph_title, + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + ) or die $graph1->error; + + $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd1 = $graph1->plot(\@mbias_read1)) { + open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + binmode MBIAS_G1; + print MBIAS_G1 $gd1->png; + } + else { + warn "WARNING: Cannot generate read 1 M-bias plot: " , $graph1->error , "\n\n"; + } + + # my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; + # open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + # binmode MBIAS_G1; + # print MBIAS_G1 $gd1->png; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read2[0]} = (); + + print MBIAS "$context context (R2)\n================\n"; + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + + push @{$mbias_read2[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read2[1]},$percent; + push @{$mbias_read2[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read2[2]},$percent; + push @{$mbias_read2[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read2[3]},$percent; + push @{$mbias_read2[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph2->set( + x_label => 'position (bp)', + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# calls', + title => 'M-bias (Read 2)', + ) or die $graph2->error; + + $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + ### Failure to plot the MBIAS graph will now generate a warning instead of dieing (previous version below. Suggested by Andrew DeiRossi, 05 June 2014) + if (my $gd2 = $graph2->plot(\@mbias_read2)) { + open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + binmode MBIAS_G2; + print MBIAS_G2 $gd2->png; + } + else { + warn "WARNING: Cannot generate Read 2 M-bias plot: " , $graph2->error , "\n\n"; + } + + # my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; + # open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + # binmode MBIAS_G2; + # print MBIAS_G2 $gd2->png; + + } + } +} + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $ignore_r2; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_dir; + my $no_header; + my $bedGraph; + my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $cytosine_report; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $split_by_chromosome; + my $sort_size; + my $samtools_path; + my $gzip; + my $mbias_only; + my $mbias_off; + my $gazillion; + my $ample_mem; + my $include_overlap; + my $ignore_3prime; + my $ignore_3prime_r2; + my $multicore; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'ignore_r2=i' => \$ignore_r2, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + 'o|output=s' => \$output_dir, + 'no_header' => \$no_header, + 'bedGraph' => \$bedGraph, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "cytosine_report" => \$cytosine_report, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + "buffer_size=s" => \$sort_size, + 'samtools_path=s' => \$samtools_path, + 'gzip' => \$gzip, + 'mbias_only' => \$mbias_only, + 'mbias_off' => \$mbias_off, + 'gazillion|scaffolds' => \$gazillion, + 'ample_memory' => \$ample_mem, + 'include_overlap' => \$include_overlap, + 'ignore_3prime=i' => \$ignore_3prime, + 'ignore_3prime_r2=i' => \$ignore_3prime_r2, + 'multicore=i' => \$multicore, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version + Copyright 2010-15 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### M-BIAS ONLY + if ($mbias_only){ + + if ($mbias_off){ + die "Options '--mbias_only' and '--mbias_off' are not compatible. Just pick one, mkay?\n"; + } + if ($bedGraph){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; + } + if ($cytosine_report){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; + } + if ($merge_non_CpG){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; + } + if ($full){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; + } + sleep(3); + } + + ### PRINT A REPORT + unless ($report){ + $report = 1; # making this the default + } + + ### OUTPUT DIR PATH + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + $output_dir = ''; + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + + ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header + + if ($vanilla){ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; + } + else{ # SAM/BAM format + + my $file = $filenames[0]; + warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); + + ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file + if ($file =~ /\.gz$/){ + open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/ || isBam($file) ){ ### this would allow to read BAM files that do not end in *.bam + open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; + } + + while (){ + last unless (/^\@/); + if ($_ =~ /^\@PG/){ + # warn "found the \@PG line:\n"; + # warn "$_"; + + if ($_ =~ /\s+-1\s+/ and $_ =~ /\s+-2\s+/){ + warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 1; + $single_end = 0; + } + else{ + warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 0; + $single_end = 1; + } + } + } + + # close DETERMINE or warn $!; # this always throws an error anyway... + + } + } + + ### IGNORING 5' END + # bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + + if (defined $ignore_r2){ + die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_r2 = 0; + } + + ### IGNORING 3' END + # bases at the end of the read when processing the methylation call string + unless ($ignore_3prime){ + $ignore_3prime = 0; + } + + if (defined $ignore_3prime_r2){ + die "You can only specify --ignore_3prime_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_3prime_r2 = 0; + } + + + ### NO OVERLAP + ### --no_overlap is the default (as of version 0.12.6), unless someone explicitly asks to include overlaps + if ($include_overlap){ + die "The option '--include_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + warn "Setting option '--inlcude_overlap' for paired-end data (user-defined)\n\n"; + $no_overlap = 0; + } + else{ # default + if ($paired_end){ + warn "Setting option '--no_overlap' since this is (normally) the right thing to do for paired-end data\n\n"; + $no_overlap = 1; + } + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + if ($zero){ + die "Option '--zero' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + + if ($CX_context){ + die "Option '--CX_context' is only available if '--bedGraph' or '--cytosine_report' are specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + else{ + $CX_context = 0; + } + + unless ($counts){ + $counts = 1; # counts will always be set + } + + if ($cytosine_report){ + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + unless ($bedGraph){ + warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; + $bedGraph = 1; + } + unless ($counts){ + # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; + $counts = 1; + } + warn "\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } + } + + unless (defined $samtools_path){ + $samtools_path = ''; + } + + + if ($gazillion){ + if ($ample_mem){ + die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; + } + } + + if (defined $multicore){ + unless ($multicore > 0){ + die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; + } + if ($multicore > 20){ + warn "Core usage currently set to more than 20 threads. Let's see how this goes... (set value: $multicore)\n\n"; + } + } + else{ + $multicore = 1; # default. Single-thread mode + warn "Setting core usage to single-threaded (default). Consider using --multicore to speed up the extraction process.\n\n"; + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_off,$mbias_only,$gazillion,$ample_mem,$ignore_3prime,$ignore_3prime_r2,$multicore); +} + + +sub test_positional_sorting{ + + my $filename = shift; + + print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; + sleep(1); + + if ($filename =~ /\.gz$/) { + open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (TEST,$filename) or die "Can't open file $filename: $!\n"; + } + + my $count = 0; + + while () { + if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) + if (/^\@SO/) { + die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + next; + } + $count++; + + last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID + + my ($id_1) = (split (/\t/)); + + ### reading the next line which should be read 2 + $_ = ; + my ($id_2) = (split (/\t/)); + last unless ($id_2); + ++$count; + + if ($id_1 eq $id_2){ + ### ids are the same + next; + } + else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first + my $id_1_trunc = $id_1; + $id_1_trunc =~ s/\/1$//; + my $id_2_trunc = $id_2; + $id_2_trunc =~ s/\/2$//; + + unless ($id_1_trunc eq $id_2_trunc){ + die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + } + } + # close TEST or die $!; somehow fails on our cluster... + ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) + warn "...passed!\n"; + sleep(1); + +} + +sub process_Bismark_results_file{ + + my $filename = shift; + my $report_filename = open_output_filehandles($filename); + + ### disabling buffering so we don't run into problems with half written out lines... + foreach my $fh (keys %fhs){ + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + select($fhs{$fh}->{$context}); + $|++; + } + } + else{ + select($fhs{$fh}); + $|++; + } + } + select(STDOUT); + + ################################################ + ################################################ + ### multi-process handling + ### + + my $offset = 1; + my $process_id; + my @pids; + if ($multicore > 1){ + + until ($offset == $multicore){ + # warn "multicore: $multicore\noffset: $offset\n"; + my $fork = fork; + + if (defined $fork){ + if ($fork != 0){ + $process_id = $fork; + push @pids, $process_id; + if ($offset < $multicore){ + ++$offset; + # warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n"; + } + else{ + # warn "Reached the number of maximum multicores. Proceeeding to processing...\n"; + } + } + elsif ($fork == 0){ + # warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n"; + $process_id = $fork; + last; + } + } + else{ + die "Forking unsuccessful. Proceeding using a single thread only\n"; + } + } + + # warn "\nThe thread identity\n===================\n"; + if ($process_id){ + # print "I am the parent process. My children are called:\n"; + # print join ("\t",@pids),"\n"; + # print "I am going to process the following line count: $offset\n\n"; + } + elsif($process_id == 0){ + # warn "I am a child process: Process ID: $process_id\n"; + # warn "I am going to process the following line count: $offset\n\n"; + } + else{ + die "Process ID was: $process_id\n"; + } + } + else{ + # warn "Single-core mode: setting pid to 1\n"; + $process_id = 1; + } + + ################################################ + ################################################ + if ($process_id){ + warn "Now reading in Bismark result file $filename\n"; + } + else{ + warn "\nNow reading in Bismark result file $filename\n\n"; + } + + if ($filename =~ /\.gz$/) { + open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam + if ($samtools_path){ + open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla) { + my $bismark_version = ; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/) { + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (1); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } else { + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single) { + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark output format + while () { + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call) { + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + $start += $ignore; + } + elsif ($strand eq '-') { + $start += length($meth_call)-1; ## $meth_call is already shortened! + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + + ### Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime + if ($ignore_3prime) { + + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + + ### If we are clipping off some bases at the end we need to adjust the end position of the alignments accordingly + if ($strand eq '+') { + # clipping the 3' end does not affect the starting position # ignore 5' has already been taken care of, if relevant at all + } + elsif ($strand eq '-') { + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # position adjusted already, and because of this 3' trimming is irrelevant for the start position + } + else{ + # Here we need to add the length ignore_3prime to the read starting position, adjustment of the start position will take place later in the methylation extraction step + $start += $ignore_3prime; + } + } + else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### just as a comment, if --ignore has not been specified the starting position of reverse reads is adjusted later at the methylation extraction stage + + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } else { # processing single-end SAM format (default) + while () { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); + next; + } + + ++$line_count; + # warn "$line_count\n"; + warn "Processed lines: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call = $value; + $meth_call =~ s/\r//; + } elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + # warn "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call) { + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-') { + $meth_call = reverse $meth_call; + } + # warn "\n$meth_call\n"; + + ### IGNORE 5 PRIME: Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + # warn "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + for (1..$ignore) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar) { + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + + ####################### + ### INGORE 3' END ### + ####################### + + # Clipping off the last number of bases from the methylation call string as specified with --ignore_3prime + if ($ignore_3prime) { + # warn "$meth_call\n"; + $meth_call = substr($meth_call,0, (length($meth_call)) - $ignore_3prime); + # warn "$meth_call\n";sleep(1); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + + # print "original CIGAR: $cigar\n"; + # print join ("",@comp_cigar),"\n"; + + ### If we are clipping off some bases at the end we might have to adjust the start position of the alignments accordingly + if ($strand eq '+') { + + ### clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations from the end + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + # print join ("",@comp_cigar),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + # print "Final truncated CIGAR string:\n"; + # print join ("",@comp_cigar),"\n"; + # $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore_3prime: $ignore_3prime\t D count: $D_count I_count: $I_count\n"; + } + elsif ($strand eq '-') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore_3prime) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + + } + + # here we need to discriminate if the start has been adjusted because of --ignore or not + if ($ignore){ + # the start position has already been modified for --ignore already, so we don't have to adjust the position again now + } + else{ + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true starting position of this reverse read will take place later in the methylation extraction step + $start += $ignore_3prime + $D_count - $I_count; + } + + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore_3prime adjusted:\n"; print join ("",@comp_cigar),"\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore_3prime adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired) { + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark paired-end output format + while () { + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000 == 0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + next; + } + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB!!! + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT!!! + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2) { + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' + + ### IGNORE FROM 5' END + if ($ignore) { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + } + if ($ignore_r2) { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified + $end_read_2 -= $ignore_r2; + } + + ### IGNORE 3' END + + ### Clipping off the last number of bases from the methylation call string of Read 1 as specified with --ignore_3prime + if ($ignore_3prime) { + $meth_call_1 = substr($meth_call_1,0, length($meth_call_1) - $ignore_3prime); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + ### Clipping off the last number of bases from the methylation call string of Read 2 as specified with --ignore_3prime_r2 + if ($ignore_3prime_r2) { + $meth_call_2 = substr($meth_call_2,0, length($meth_call_2) - $ignore_3prime_r2); + # we don't have to adjust the position now since the shortened methylation call will be fine, see below + } + + my $end_read_1; + my $start_read_2; + + if ($strand eq '+') { + + $end_read_1 = $start_read_1 + length($meth_call_1) - 1; + $start_read_2 = $end_read_2 - length($meth_call_2) + 1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); + } + else { + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + else { # Bismark paired-end BAM/SAM output format (default) + while () { + chomp; + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_\n" unless ($process_id == 0); # no additional warnings for child procesess + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + if ( ($line_count - $offset)%$multicore == 0){ + # warn "line count: $line_count\noffset: $offset\n"; + # warn "Modulus: ",($line_count - $offset)%$multicore,"\n"; + # warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n"; + } + else{ + # warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n"; + $_ = ; # reading and skipping another line since this is the paired-end read + next; + } + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = ; # reading in the paired read + chomp; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + # chomp $genome_conversion; # in case it captured a new line character # already removed + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + $strand = '+'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT + $strand = '-'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB + $strand = '+'; + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+') { + $meth_call_2 = reverse $meth_call_2; + } else { + $meth_call_1 = reverse $meth_call_1; + } + # warn "\n$meth_call_1\n$meth_call_2\n"; + + if ($meth_call_1 and $meth_call_2) { + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1) { + foreach (1..$len_1[$index]) { + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2) { + foreach (1..$len_2[$index]) { + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + ################################## + ### IGNORE BASES FROM 5' END ### + ################################## + + if ($ignore) { + ### Clipping off the first number of bases from the methylation call string as specified with '--ignore ' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + } + } + } + + if ($ignore_r2) { + ### Clipping off the first number of bases from the methylation call string as specified with '--ignore_r2 ' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_r2) <= 0){ + # next; # skipping this read entirely if the read is shorter than the portion to be ignored # this would skip the entire read pair! + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + } + } + + if ($ignore and $meth_call_1){ # if the methylation call string is undefined at this position we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 5' END ### + ########################### + + ################################## + ### IGNORE BASES FROM 3' END ### + ################################## + + # print "CIGAR string before truncating 3' end (Read 1)\n"; + # print join ("",@comp_cigar_1),"\n"; + + if ($ignore_3prime and $meth_call_1) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime ' for read 1 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_1) - $ignore_3prime) <= 0){ + $meth_call_1 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_1 = substr($meth_call_1,0,length($meth_call_1) - $ignore_3prime); + # warn "truncated meth_call 1:\n$meth_call_1\n"; + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end does not affect the starting position of forward strand alignments + # ignore 5' has already been taken care of at this stage, if relevant at all + + for (1..$ignore_3prime) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore_3prime operations from the end + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + # print "$_ deleted $op from 3' end\n"; + } + } + + # print "Final truncated CIGAR string (Read 1):\n"; + # print join ("",@comp_cigar_1),"\n"; + + } + elsif ($strand eq '-') { + + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + ### if the (read 1) strand information is '-', the read 1 CIGAR string needs to be trimmed from the start + for (1..$ignore_3prime) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore_3prime operations, here the first value of the array + # print join ("",@comp_cigar_1),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print join ("",@comp_cigar_1),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + # print "Final truncated CIGAR string reverse_read:\n"; + # print join ("",@comp_cigar_1),"\n"; + + # Here we need to add the length ignore_3prime to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_1 += $ignore_3prime + $D_count_1 - $I_count_1; + + } + } + } + + if ($ignore_3prime_r2 and $meth_call_2) { # if the methylation call string is undefined at this point we don't need to process the read any further + + ### Clipping off the last number of bases from the methylation call string as specified with '--ignore_3prime_r2 ' for read 2 + ### the methylation calls have already been reversed where necessary + + if ( (length($meth_call_2) - $ignore_3prime_r2) <= 0){ + $meth_call_2 = undef; # will skip this read entirely since the read is shorter than the portion to be ignored + } + else { + $meth_call_2 = substr($meth_call_2,0,length($meth_call_2) - $ignore_3prime_r2); + # warn "truncated meth_call 2:\n$meth_call_2\n"; + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string and the positions accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', clipping the 3' end of read 2 does potentially affect the starting position of read 2 (reverse strand alignment) + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the start + + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_3prime_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the first value of the array + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op from 3' end\n"; + # print join ("",@comp_cigar_2),"\n"; + } + + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + # print "Final truncated CIGAR string 2 (+ alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + # Here we need to add the length ignore_3prime_r2 to the read starting position + # adjustment of the true start position of this reverse read will take place later in the methylation extraction step + $start_read_2 += $ignore_3prime_r2 + $D_count_2 - $I_count_2; + + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + elsif ($strand eq '-') { + ### if the (read 1) strand information is '-', clipping the 3' end of read 2 does not affect its starting position (forward strand alignment) + ### ignore_r2 5' has already been taken care of at this stage, if relevant at all + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the end + + for (1..$ignore_3prime_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + # print "$_ deleted $op\n"; + # print join ("",@comp_cigar_2),"\n"; + } + } + + # print "Final truncated CIGAR string 2 (- alignment):\n"; + # print join ("",@comp_cigar_2),"\n"; + + } + } + } + + if ($ignore_3prime and $meth_call_1){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore_3prime adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } + else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # warn "ignore_3prime adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_3prime_r2 and $meth_call_2){ # if the methylation call string is undefined at this point we don't need any new CIGAR string + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore_3prime_r2 adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # warn "ignore_3prime_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ########################### + ### END IGNORE 3' END ### + ########################### + + + ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on + + if ($strand eq '+') { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2) { + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } + else { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + } + + if ($strand eq '+') { + ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + else { + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation call strings + } + } + } + } else { + die "Single-end or paired-end reads not specified properly\n"; + } + + $counting{sequences_count} = $line_count; + $counting{methylation_call_strings} = $methylation_call_strings_processed; + + if ($multicore == 1){ + print_splitting_report (); + } + elsif ($multicore > 1){ + print_splitting_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + print_mbias_report_multicore($report_filename,$offset,$line_count,$methylation_call_strings_processed); + } + + return ($process_id,\@pids,$report_filename); + +} + + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + warn "\nProcessed $counting{sequences_count} lines in total\n"; + warn "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + if ($report) { + print REPORT "\nProcessed $counting{sequences_count} lines in total\n"; + print REPORT "Total number of methylation call strings processed: $counting{methylation_call_strings}\n\n"; + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn"Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn"Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + warn "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + +### + +sub print_splitting_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_report = $report_filename.".$offset"; + + open (SPECIAL_REPORT,'>',$special_report) or die $!; + # warn "line count\t$line_count\n"; + # warn "meth call strings\t$meth_call_strings\n"; + + print SPECIAL_REPORT "line count\t$line_count\n"; + print SPECIAL_REPORT "meth call strings\t$meth_call_strings\n"; + + ### Calculating methylation percentages if applicable + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + + ### detailed information about Cs analysed + print SPECIAL_REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print SPECIAL_REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print SPECIAL_REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print SPECIAL_REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print SPECIAL_REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print SPECIAL_REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print SPECIAL_REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print SPECIAL_REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print SPECIAL_REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print SPECIAL_REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print SPECIAL_REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + close SPECIAL_REPORT or warn "Failed to close filehandle for individual report $special_report\n"; +} + + +### + +### INDIVIDUAL M-BIAS REPORTS + +sub print_mbias_report_multicore{ + + my ($report_filename,$offset,$line_count,$meth_call_strings) = @_; + + # warn "\$report_filename is $report_filename\n"; + my $special_mbias_report = $report_filename.".${offset}.mbias"; + + open (SPECIAL_MBIAS,'>',$special_mbias_report) or die $!; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + # warn "Determining maximum read length for M-Bias plot\n"; + # warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + # warn "Determining maximum read lengths for M-Bias plots\n"; + # warn "Maximum read length of Read 1: $max_length_1\n"; + # warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + + foreach my $context (qw(CpG CHG CHH)){ + + if ($paired){ + print SPECIAL_MBIAS "$context context (R1)\n================\n"; + } + else{ + print SPECIAL_MBIAS "$context context\n===========\n"; + } + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + print SPECIAL_MBIAS "\n"; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + + print SPECIAL_MBIAS "$context context (R2)\n================\n"; + print SPECIAL_MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print SPECIAL_MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + } + } + } + + close SPECIAL_MBIAS or warn "Failed to close filehandle for individual M-bias report $special_mbias_report\n"; +} + + +### + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; + + unless (defined $meth_call) { + return; # skip this read + } + + ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 + die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); + + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in Unknown context (was protected) ### + ### u for not methylated C in Unknown context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # this check speeds up the extraction process by up to 60%!!! + } + else{ # parsing CIGAR string + my @len; + my @ops; + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + if (@comp_cigar){ + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + } + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + if ($no_overlap) { # this has to be read 2... + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + + ### Checking whether the CIGAR string is a linear genomic match or whether it requires indel processing + if ($cigar =~ /^\d+M$/){ + # warn "See!? I told you so! $cigar\n"; + # sleep(1); + } + else{ + + my @len; + my @ops; + + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + if (@comp_cigar){ # only needed for SAM reads with InDels + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + } + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1 - 1; + } + else{ # InDel read + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + # print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + # print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{CpG_context}} $line unless($mbias_only); + #print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + my $line = join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + my $line = join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n"; + print {$fhs{other_context}} $line unless($mbias_only); + #print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index])."\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + +sub open_output_filehandles{ + + my $filename = shift; + + my $output_filename = (split (/\//,$filename))[-1]; + my $report_filename = $output_filename; + + ### OPENING OUTPUT-FILEHANDLES + if ($report) { + $report_filename =~ s/\.sam$//; + $report_filename =~ s/\.txt$//; + $report_filename =~ s/$/_splitting_report.txt/; + $report_filename = $output_dir . $report_filename; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report) { + + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + print REPORT "Bismark Extractor Version: $version\n"; + + if ($paired) { + if ($vanilla) { + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single) { + if ($vanilla) { + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + if ($single){ + if ($ignore) { + print REPORT "Ignoring first $ignore bp\n"; + } + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp\n"; + } + } + else{ # paired-end + if ($ignore) { + print REPORT "Ignoring first $ignore bp of Read 1\n"; + } + if ($ignore_r2){ + print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; + } + + if ($ignore_3prime) { + print REPORT "Ignoring last $ignore_3prime bp of Read 1\n"; + } + if ($ignore_3prime_r2){ + print REPORT "Ignoring last $ignore_3prime_r2 bp of Read 2\n"; + } + + } + + if ($full) { + print REPORT "Output specified: comprehensive\n"; + } else { + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap) { + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta) { + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG) { + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + + ##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG) { + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ ### disclaimer: I am aware of "The Useless Use of Cat Awards", but I saw no other option... + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless ($no_header) { + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/sam$/txt/; + $other_c_output =~ s/bam$/txt/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + $other_c_output = $output_dir . $other_c_output; + + if ($gzip){ + $other_c_output .= '.gz'; + open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{other_context},"| cat > $other_c_output") or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + # open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); + push @sorting_files,$other_c_output; + + + unless ($no_header) { + print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG) { + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/sam$/txt/; + $other_c_ot =~ s/bam$/txt/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + $other_c_ot = $output_dir . $other_c_ot; + + if ($gzip){ + $other_c_ot .= '.gz'; + open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{other_c},"| cat > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); + push @sorting_files,$other_c_ot; + + unless($no_header){ + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/sam$/txt/; + $other_c_ctot =~ s/bam$/txt/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + $other_c_ctot = $output_dir . $other_c_ctot; + + if ($gzip){ + $other_c_ctot .= '.gz'; + open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{other_c},"| cat > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); + push @sorting_files,$other_c_ctot; + + unless($no_header){ + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/sam$/txt/; + $other_c_ctob =~ s/bam$/txt/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + $other_c_ctob = $output_dir . $other_c_ctob; + + if ($gzip){ + $other_c_ctob .= '.gz'; + open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{other_c},"| cat > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); + push @sorting_files,$other_c_ctob; + + unless($no_header){ + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + $other_c_ob = $output_dir . $other_c_ob; + + if ($gzip){ + $other_c_ob .= '.gz'; + open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{other_c},"| cat > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); + push @sorting_files,$other_c_ob; + + unless($no_header){ + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full) { + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{CpG_context},"| cat > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + # open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless($no_header){ + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/sam$/txt/; + $chg_output =~ s/bam$/txt/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + $chg_output = $output_dir . $chg_output; + + if ($gzip){ + $chg_output .= '.gz'; + open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHG_context},"| cat > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + # open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); + push @sorting_files,$chg_output; + + unless($no_header){ + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/sam$/txt/; + $chh_output =~ s/bam$/txt/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + $chh_output = $output_dir . $chh_output; + + if ($gzip){ + $chh_output .= '.gz'; + open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHH_context},"| cat > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + # open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); + push @sorting_files, $chh_output; + + unless($no_header){ + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else { + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},"| cat > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},"| cat > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},"| cat > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},"| cat > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/sam$/txt/; + $chg_ot =~ s/bam$/txt/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + $chg_ot = $output_dir . $chg_ot; + + if ($gzip){ + $chg_ot .= '.gz'; + open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHG},"| cat > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); + push @sorting_files,$chg_ot; + + unless($no_header){ + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/sam$/txt/; + $chg_ctot =~ s/bam$/txt/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + $chg_ctot = $output_dir . $chg_ctot; + + if ($gzip){ + $chg_ctot .= '.gz'; + open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHG},"| cat > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); + push @sorting_files,$chg_ctot; + + unless($no_header){ + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/sam$/txt/; + $chg_ctob =~ s/bam$/txt/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + $chg_ctob = $output_dir . $chg_ctob; + + if ($gzip){ + $chg_ctob .= '.gz'; + open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHG},"| cat > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); + push @sorting_files,$chg_ctob; + + unless($no_header){ + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/sam$/txt/; + $chg_ob =~ s/bam$/txt/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + $chg_ob = $output_dir . $chg_ob; + + if ($gzip){ + $chg_ob .= '.gz'; + open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHG},"| cat > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); + push @sorting_files,$chg_ob; + + unless($no_header){ + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/sam$/txt/; + $chh_ot =~ s/bam$/txt/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + $chh_ot = $output_dir . $chh_ot; + + if ($gzip){ + $chh_ot .= '.gz'; + open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHH},"| cat > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + # open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); + push @sorting_files,$chh_ot; + + unless($no_header){ + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/sam$/txt/; + $chh_ctot =~ s/bam$/txt/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + $chh_ctot = $output_dir . $chh_ctot; + + if ($gzip){ + $chh_ctot .= '.gz'; + open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHH},"| cat > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + # open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); + push @sorting_files,$chh_ctot; + + unless($no_header){ + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/sam$/txt/; + $chh_ctob =~ s/bam$/txt/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + $chh_ctob = $output_dir . $chh_ctob; + + if ($gzip){ + $chh_ctob .= '.gz'; + open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHH},"| cat > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + # open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); + push @sorting_files,$chh_ctob; + + unless($no_header){ + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/sam$/txt/; + $chh_ob =~ s/bam$/txt/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + $chh_ob = $output_dir . $chh_ob; + + if ($gzip){ + $chh_ob .= '.gz'; + open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHH},"| cat > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + # open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); + push @sorting_files,$chh_ob; + + unless($no_header){ + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + return $report_filename; +} + +sub isBam{ + + my $filename = shift; + + # reading the first line of the input file to see if it is a BAM file in disguise (i.e. a BAM file that does not end in *.bam which may be produced by Galaxy) + open (DISGUISE,"zcat $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n"; + + ### when BAM files read through a zcat stream they start with BAM... + my $bam_in_disguise = ; + # warn "BAM in disguise: $bam_in_disguise\n\n"; + + if ($bam_in_disguise){ + if ($bam_in_disguise =~ /^BAM/){ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 1; + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } + } + else{ + close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n"; + return 0; + } +} + + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper and extracts the methylation information +for individual cytosines. This information is found in the methylation call field +which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context ~~~ + ~~~ x for not methylated C CHG ~~~ + ~~~ H for methylated C in CHH context ~~~ + ~~~ h for not methylated C in CHH context ~~~ + ~~~ Z for methylated C in CpG context ~~~ + ~~~ z for not methylated C in CpG context ~~~ + ~~~ U for methylated C in Unknown context (CN or CHN ~~~ + ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + + + + +USAGE: methylation_extractor [options] + + +ARGUMENTS: +========== + + A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. Specifying either --single-end or --paired-end is + mandatory. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. Specifying either --paired-end or --single-end is + mandatory. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice (only methylation calls of read 1 are used for in the process + since read 1 has historically higher quality basecalls than read 2). + Whilst this option removes a bias towards more methylation calls + in the center of sequenced fragments it may de facto remove a sizable + proportion of the data. This option is on by default for paired-end data + but can be disabled using --include_overlap. Default: ON. + +--include_overlap For paired-end data all methylation calls will be extracted irrespective of + of whether they overlap or not. Default: OFF. + +--ignore Ignore the first bp from the 5' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove e.g. a + restriction enzyme site at the start of each read or any other source of + bias (such as PBAT-Seq data). + +--ignore_r2 Ignore the first bp from the 5' end of Read 2 of paired-end sequencing + results only. Since the first couple of bases in Read 2 of BS-Seq experiments + show a severe bias towards non-methylation as a result of end-repairing + sonicated fragments with unmethylated cytosines (see M-bias plot), it is + recommended that the first couple of bp of Read 2 are removed before + starting downstream analysis. Please see the section on M-bias plots in the + Bismark User Guide for more details. + +--ignore_3prime Ignore the last bp from the 3' end of Read 1 (or single-end alignment + files) when processing the methylation call string. This can remove unwanted + biases from the end of reads. + +--ignore_3prime_r2 Ignore the last bp from the 3' end of Read 2 of paired-end sequencing + results only. This can remove unwanted biases from the end of reads. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. Default: ON. + +--no_header Suppresses the Bismark version header line in all output files for more convenient + batch processing. + +-o/--output DIR Allows specification of a different output directory (absolute or relative + path). If not specified explicitely, the output will be written to the current directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in + a GZIP compressed form to save disk space. This option does not work on bedGraph and + genome-wide cytosine reports as they are 'tiny' anyway. + +--version Displays version information. + +-h/--help Displays this help file and exits. + +--mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as + well as a report (optional) and then quit. Default: OFF. + +--mbias_off The methylation extractor will process the entire file as usual but doesn't write out any M-bias report. + Only recommended for users who deliberately want to keep an earlier version of the M-bias report. + Default: OFF. + +--multicore Sets the number of cores to be used for the methylation extraction process. If system resources + are plentiful this is a viable option to speed up the extraction process (we observed a near linear + speed increase for up to 10 cores used). Please note that a typical process of extracting a BAM file + and writing out '.gz' output streams will in fact use ~3 cores per value of --multicore + specified (1 for the methylation extractor itself, 1 for a Samtools stream, 1 for GZIP stream), so + --multicore 10 is likely to use around 30 cores of system resources. This option has no bearing + on the bismark2bedGraph or genome-wide cytosine report processes. + + + + +bedGraph specific options: +========================== + +--bedGraph After finishing the methylation extraction, the methylation output is written into a + sorted bedGraph file that reports the position of a given cytosine and its methylation + state (in %, see details below). The methylation extractor output is temporarily split up into + temporary files, one per chromosome (written into the current directory or folder + specified with -o/--output); these temp files are then used for sorting and deleted + afterwards. By default, only cytosines in CpG context will be sorted. The option + '--CX_context' may be used to report all cytosines irrespective of sequence context + (this will take MUCH longer!). The default folder for temporary files during the sorting + process is the output directory. The bedGraph conversion step is performed by the external + module 'bismark2bedGraph'; this script needs to reside in the same folder as the + bismark_methylation_extractor itself. + +--zero_based Write out an additional coverage file (ending in .zero.cov) that uses 0-based genomic start + and 1-based genomic end coordinates (zero-based, half-open), like used in the bedGraph file, + instead of using 1-based coordinates throughout. Default: OFF. + + +--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide + before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + +--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of + scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to + individual chromosome files. These errors were caused by the operating system's limit + of the number of filehandle that can be written to at any one time (typically 1024; to + find out this limit on Linux, type: ulimit -a). + To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort + methylation calls into individual chromosome files. Instead, all input files are + temporarily merged into a single file (unless there is only a single file), and this + file will then be sorted by both chromosome AND position using the Unix sort command. + Please be aware that this option might take a looooong time to complete, depending on + the size of the input files, and the memory you allocate to this process (see --buffer_size). + Nevertheless, it seems to be working. + +--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will + instead use two arrays to sort methylated and unmethylated calls. This may result in a faster + sorting process of very large files, but this comes at the cost of a larger memory footprint + (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB + of RAM). Due to overheads in creating and looping through these arrays it seems that it will + actually be *slower* for small files (few million alignments), and we are currently testing at + which point it is advisable to use this option. Note that --ample_memory is not compatible + with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). + + + +Genome-wide cytosine methylation report specific options: +========================================================= + +--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a + genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based + chromosome coordinates (zero-based start coords are optional) and reports CpG context only (all + cytosine context is optional). The output considers all Cs on both forward and reverse strands and + reports their position, strand, trinucleotide content and methylation state (counts are 0 if not + covered). The cytosine report conversion step is performed by the external module + 'coverage2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor + itself. + +--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--zero_based Uses 0-based genomic coordinates instead of 1-based coordinates. Default: OFF. + +--genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + + + +OUTPUT: + +The bismark_methylation_extractor output is in the form: +======================================================== + + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + + +The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): +========================================================================================================= + +track type=bedGraph (header line) + + + + + +The coverage output looks like this (tab-delimited, 1-based genomic coords; zero-based half-open coordinates available with '--zero_based'): +============================================================================================================================================ + + + + + +The genome-wide cytosine methylation output file is tab-delimited in the following format: +========================================================================================== + + + + +This script was last modified on 22 April 2015. + +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 old/bismark --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/old/bismark Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,7959 @@ +#!/usr/bin/perl -- +use strict; +use warnings; +use IO::Handle; +use Cwd; +$|++; +use Getopt::Long; + + +## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + + +my $parent_dir = getcwd; +my $bismark_version = 'v0.10.0'; +my $command_line = join (" ",@ARGV); + +### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail +foreach my $arg (@ARGV){ + if ($arg eq '--solexa1.3-quals'){ + $arg = '--phred64-quals'; + } +} +my @filenames; # will be populated by processing the command line + +my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line(); + +my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment +my %chromosomes; # stores the chromosome sequences of the mouse genome +my %counting; # counting various events + +my $seqID_contains_tabs; + +foreach my $filename (@filenames){ + + chdir $parent_dir or die "Unable to move to initial working directory $!\n"; + ### resetting the counting hash and fhs + reset_counters_and_fhs($filename); + $seqID_contains_tabs = 0; + + ### PAIRED-END ALIGNMENTS + if ($filename =~ ','){ + my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file + + $fhs[0]->{name} = 'CTread1GAread2CTgenome'; + $fhs[1]->{name} = 'GAread1CTread2GAgenome'; + $fhs[2]->{name} = 'GAread1CTread2CTgenome'; + $fhs[3]->{name} = 'CTread1GAread2GAgenome'; + + warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; + + my ($filename_1,$filename_2) = (split (/,/,$filename)); + warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; + + ### additional variables only for paired-end alignments + my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Input files are in FastA format\n"; + + if ($directional){ + ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + + ### FastQ format + else{ + warn "Input files are in FastQ format\n"; + if ($directional){ + if ($bowtie2){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[0]->{inputfile_2} = undef; # no longer needed + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format + $fhs[3]->{inputfile_2} = undef; # no longer needed + } + else{ + ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = undef; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = undef; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + elsif($pbat){ # PBAT-Seq + ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1 + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = undef; + $fhs[0]->{inputfile_2} = undef; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = undef; + $fhs[3]->{inputfile_2} = undef; + } + else{ + if ($bowtie2){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + else{ # Bowtie 1 alignments + if ($gzip){ + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = undef; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = undef; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files + } + else{ #uncompressed temp files + ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number + ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); + + $fhs[0]->{inputfile_1} = $C_to_T_infile_1; + $fhs[0]->{inputfile_2} = $G_to_A_infile_2; + $fhs[1]->{inputfile_1} = $G_to_A_infile_1; + $fhs[1]->{inputfile_2} = $C_to_T_infile_2; + $fhs[2]->{inputfile_1} = $G_to_A_infile_1; + $fhs[2]->{inputfile_2} = $C_to_T_infile_2; + $fhs[3]->{inputfile_1} = $C_to_T_infile_1; + $fhs[3]->{inputfile_2} = $G_to_A_infile_2; + } + } + } + if ($bowtie2){ + paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + else{ + paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + } + start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + + ### Else we are performing SINGLE-END ALIGNMENTS + else{ + warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; + ### Initialising bisulfite conversion filenames + my ($C_to_T_infile,$G_to_A_infile); + + + ### FastA format + if ($sequence_file_format eq 'FASTA'){ + warn "Inut file is in FastA format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); + } + } + + ## FastQ format + else{ + warn "Input file is in FastQ format\n"; + if ($directional){ + ($C_to_T_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + } + elsif($pbat){ + ($G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files + } + else{ + ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); + $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; + $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; + } + + ### Creating up to 4 different bowtie filehandles and storing the first entry + if ($bowtie2){ + single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); + } + elsif ($pbat){ + single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); + } + else{ + single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); + } + } + + start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile); + + } +} + +sub start_methylation_call_procedure_single_ends { + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + my ($dir,$filename); + + if ($sequence_file =~ /\//){ + ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $sequence_file; + } + + ### printing all alignments to a results file + my $outfile = $filename; + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + + + if ($bowtie2){ # SAM format is the default for Bowtie 2 + $outfile =~ s/$/_bismark_bt2.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark.txt/; + } + else{ # SAM is the default output + $outfile =~ s/$/_bismark.sam/; + } + + $bam = 0 unless (defined $bam); + + if ($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + + warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n"; + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_SE_report.txt/; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; + + if ($unmapped){ + my $unmapped_file = $filename; + if ($prefix){ + $unmapped_file = "$prefix.$unmapped_file"; + } + + $unmapped_file =~ s/$/_unmapped_reads.txt/; + open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; + print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; + } + if ($ambiguous){ + my $ambiguous_file = $filename; + if ($prefix){ + $ambiguous_file = "$prefix.$ambiguous_file"; + } + $ambiguous_file =~ s/$/_ambiguous_reads.txt/; + open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; + print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n"; + } + print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + + + ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + print "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input file is in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); + } + ### Input file is in FastQ format + else{ + process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); + } +} + +sub start_methylation_call_procedure_paired_ends { + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + my ($dir_1,$filename_1); + + if ($sequence_file_1 =~ /\//){ + ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_1 = $sequence_file_1; + } + + my ($dir_2,$filename_2); + + if ($sequence_file_2 =~ /\//){ + ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; + } + else{ + $filename_2 = $sequence_file_2; + } + + ### printing all alignments to a results file + my $outfile = $filename_1; + + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + + if ($bowtie2){ # SAM format is the default Bowtie 2 output + $outfile =~ s/$/_bismark_bt2_pe.sam/; + } + elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) + $outfile =~ s/$/_bismark_pe.txt/; + } + else{ # SAM format is the default Bowtie 1 output + $outfile =~ s/$/_bismark_pe.sam/; + } + + $bam = 0 unless (defined $bam); + + if ($bam == 1){ ### Samtools is installed, writing out BAM directly + $outfile =~ s/sam/bam/; + open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead + $outfile .= '.gz'; + open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + else{ # uncompressed ouput, default + open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + } + + warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n"; + sleep(1); + + if ($vanilla){ + print OUT "Bismark version: $bismark_version\n"; + } + + ### printing alignment and methylation call summary to a report file + my $reportfile = $filename_1; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + + if ($bowtie2){ + $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; + } + else{ + $reportfile =~ s/$/_bismark_PE_report.txt/; + } + + open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; + print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; + print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + + + ### Unmapped read output + if ($unmapped){ + my $unmapped_1 = $filename_1; + my $unmapped_2 = $filename_2; + if ($prefix){ + $unmapped_1 = "$prefix.$unmapped_1"; + $unmapped_2 = "$prefix.$unmapped_2"; + } + $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; + $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; + open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; + open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; + print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; + } + + if ($ambiguous){ + my $amb_1 = $filename_1; + my $amb_2 = $filename_2; + if ($prefix){ + $amb_1 = "$prefix.$amb_1"; + $amb_2 = "$prefix.$amb_2"; + } + + $amb_1 =~ s/$/_ambiguous_reads_1.txt/; + $amb_2 =~ s/$/_ambiguous_reads_2.txt/; + open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; + open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; + print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; + } + + if ($directional){ + print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n"; + } + + ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time + unless (%chromosomes){ + my $cwd = getcwd; # storing the path of the current working directory + print "Current working directory is: $cwd\n\n"; + read_genome_into_memory($cwd); + } + + unless ($vanilla or $sam_no_hd){ + generate_SAM_header(); + } + + ### Input files are in FastA format + if ($sequence_file_format eq 'FASTA'){ + process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } + ### Input files are in FastQ format + else{ + process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + } +} + +sub print_final_analysis_report_single_end{ + my ($C_to_T_infile,$G_to_A_infile) = @_; + ### All sequences from the original sequence file have been analysed now + ### deleting temporary C->T or G->A infiles + + if ($directional){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; + } + else{ + warn "Could not delete temporary file $C_to_T_infile properly $!\n"; + } + } + elsif ($pbat){ + my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary file $G_to_A_infile properly $!\n"; + } + } + else{ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; + } + else{ + warn "Could not delete temporary files properly $!\n"; + } + } + + ### printing a final report for the alignment procedure + print REPORT "Final Alignment report\n",'='x22,"\n"; + warn "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequences analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; + my $percent_alignable_sequences; + + if ($counting{sequences_count} == 0){ + $percent_alignable_sequences = 0; + } + else{ + $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + + warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; + print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; + + ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), + ### only calculating the percentage if there were any overruled alignments + if ($counting{low_complexity_alignments_overruled_count}){ + my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); + # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; + } + + print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + ### detailed information about Cs analysed + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage (CHG context) if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage (CHH context) if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + + if ($seqID_contains_tabs){ + warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; + } + + + ########################################################################################################################################### + ### create pie-chart with mapping stats + ########################################################################################################################################### + + + my $filename; + if ($pbat){ + $filename = $G_to_A_infile; + } + else{ + $filename = $C_to_T_infile; + } + + my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $pie_chart =~ s/gz$//; + $pie_chart =~ s/_C_to_T\.fastq$//; + $pie_chart =~ s/_G_to_A\.fastq$//; + + # if ($prefix){ + # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation + # } + $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; + + + #Check whether the module GD::Graph is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::pie; + GD::Graph::pie->import(); + }; + + unless($@) { + $gd_graph_installed = 1; + } + else{ + warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; + sleep(2); + } + + if ($gd_graph_installed){ + warn "Generating pie chart\n\n"; + sleep(1); + my $graph = GD::Graph::pie->new(600,600); + + my $percent_unaligned; + my $percent_multiple; + my $percent_unextractable; + + if ($counting{sequences_count}){ + $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); + $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); + $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); + } + else{ + $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; + } + + my @aln_stats = ( + ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], + [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], + ); + + $graph->set( + start_angle => 180, + '3d' => 0, + label => 'Alignment stats (single-end)', + suppress_angle => 2, # Only label slices of sufficient size + transparent => 0, + dclrs => [ qw(red lorange dgreen cyan) ], + ) or die $graph->error; + + my $gd = $graph->plot(\@aln_stats) or die $graph->error; + + open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; + binmode PIE; + print PIE $gd->png; + } + + warn "====================\nBismark run complete\n====================\n\n"; + +} + + +sub print_final_analysis_report_paired_ends{ + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles + if ($directional){ + if ($G_to_A_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1"; + if ($deletion_successful == 1){ + warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n"; + } + else{ + warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n"; + } + } + } + else{ + if ($G_to_A_infile_2 and $C_to_T_infile_2){ + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; + if ($deletion_successful == 4){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete + my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"; + if ($deletion_successful == 2){ + warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n"; + } + else{ + warn "Could not delete temporary files properly: $!\n"; + } + } + } + + ### printing a final report for the alignment procedure + warn "Final Alignment report\n",'='x22,"\n"; + print REPORT "Final Alignment report\n",'='x22,"\n"; + # foreach my $index (0..$#fhs){ + # print "$fhs[$index]->{name}\n"; + # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; + # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; + # } + + ### printing a final report for the methylation call procedure + warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; + + my $percent_alignable_sequence_pairs; + if ($counting{sequences_count} == 0){ + $percent_alignable_sequence_pairs = 0; + } + else{ + $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); + } + print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; + print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; + + print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + + + print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; + print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; + print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; + print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; + print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; + ### detailed information about Cs analysed + + if ($directional){ + print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; + } + + warn "Final Cytosine Methylation Report\n",'='x33,"\n"; + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + warn "Total number of C's analysed:\t$total_number_of_C\n\n"; + warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; + + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; + } + print REPORT "\n"; + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + warn "C methylated in CpG context:\t${percent_meCpG}%\n"; + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### printing methylated C percentage in CHG context if applicable + if ($percent_meCHG){ + warn "C methylated in CHG context:\t${percent_meCHG}%\n"; + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated C percentage in CHH context if applicable + if ($percent_meCHH){ + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + + + ############################################################################################################################################ + ### create pie-chart with mapping stats + ########################################################################################################################################### + + my $filename; + if ($pbat){ + $filename = $G_to_A_infile_1; + } + else{ + $filename = $C_to_T_infile_1; + } + + my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $pie_chart =~ s/gz$//; + $pie_chart =~ s/_C_to_T.fastq$//; + $pie_chart =~ s/_G_to_A.fastq$//; + ### special format for gzipped PE Bowtie1 files + $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//; + $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//; + + if ($prefix){ + # prefix is now being prepended to the temp files already + # $pie_chart = "$prefix.$pie_chart"; + } + $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png"; + + #Check whether the module GD::Graph is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::pie; + GD::Graph::pie->import(); + }; + + unless($@) { + $gd_graph_installed = 1; + } + else{ + warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; + sleep(2); + } + + if ($gd_graph_installed){ + warn "Generating pie chart\n\n"; + sleep(1); + my $graph = GD::Graph::pie->new(600,600); + + my $percent_unaligned; + my $percent_multiple; + my $percent_unextractable; + + if ($counting{sequences_count}){ + $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); + $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); + $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); + } + else{ + $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; + } + + my @aln_stats = ( + ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], + [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], + ); + + # push @{$mbias_read1[0]},$pos; + + $graph->set( + start_angle => 180, + '3d' => 0, + label => 'Alignment stats (paired-end)', + suppress_angle => 2, # Only label slices of sufficient size + transparent => 0, + dclrs => [ qw(red lorange dgreen cyan) ], + ) or die $graph->error; + + my $gd = $graph->plot(\@aln_stats) or die $graph->error; + + open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; + binmode PIE; + print PIE $gd->png; + } + + warn "====================\nBismark run complete\n====================\n\n"; + +} + +sub process_single_end_fastA_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + # last if ($counting{sequences_count} > 100); + my $identifier = ; + my $sequence = ; + last unless ($identifier and $sequence); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + + $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG ">$identifier\n"; + print AMBIG "$sequence\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED ">$identifier\n"; + print UNMAPPED "$sequence\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); + +} + +sub process_single_end_fastQ_file_for_methylation_call{ + my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; + ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. + ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either + ### the C->T or G->A version + + ### gzipped version of the infile + if ($sequence_file =~ /\.gz$/){ + open (IN,"zcat $sequence_file |") or die $!; + } + else{ + open (IN,$sequence_file) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence file $sequence_file\n"; + while (1) { + my $identifier = ; + my $sequence = ; + my $identifier_2 = ; + my $quality_value = ; + last unless ($identifier and $sequence and $identifier_2 and $quality_value); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequences so far\n"; + } + chomp $sequence; + chomp $identifier; + chomp $quality_value; + + $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); + } + else{ + $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 + } + + unless ($return){ + $return = 0; + } + + # print the sequence to ambiguous.out if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG "\@$identifier\n"; + print AMBIG "$sequence\n"; + print AMBIG $identifier_2; + print AMBIG "$quality_value\n"; + } + + # print the sequence to file if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED "\@$identifier\n"; + print UNMAPPED "$sequence\n"; + print UNMAPPED $identifier_2; + print UNMAPPED "$quality_value\n"; + } + } + print "Processed $counting{sequences_count} sequences in total\n\n"; + + print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); + +} + +sub process_fastA_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the + ### converted genomes (either the C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + + my $count = 0; + + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + + $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or die $!; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + +} + +sub process_fastQ_files_for_paired_end_methylation_calls{ + my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to + ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. + ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) + ### of the converted genomes (either C->T or G->A version) + + ### gzipped version of the infiles + if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ + open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; + open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; + } + else{ + open (IN1,$sequence_file_1) or die $!; + open (IN2,$sequence_file_2) or die $!; + } + + my $count = 0; + + warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; + ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one + while (1) { + # reading from the first input file + my $identifier_1 = ; + my $sequence_1 = ; + my $ident_1 = ; # not needed + my $quality_value_1 = ; # not needed + # reading from the second input file + my $identifier_2 = ; + my $sequence_2 = ; + my $ident_2 = ; # not needed + my $quality_value_2 = ; # not needed + last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + $identifier_2 = fix_IDs($identifier_2); + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $counting{sequences_count}++; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; + } + + my $orig_identifier_1 = $identifier_1; + my $orig_identifier_2 = $identifier_2; + + chomp $sequence_1; + chomp $identifier_1; + chomp $sequence_2; + chomp $identifier_2; + chomp $quality_value_1; + chomp $quality_value_2; + + $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID + + my $return; + if ($bowtie2){ + $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + else{ + $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); + } + + unless ($return){ + $return = 0; + } + + # print the sequences to ambiguous_1 and _2 if --ambiguous was specified + if ($ambiguous and $return == 2){ + # seq_1 + print AMBIG_1 $orig_identifier_1; + print AMBIG_1 "$sequence_1\n"; + print AMBIG_1 $ident_1; + print AMBIG_1 "$quality_value_1\n"; + # seq_2 + print AMBIG_2 $orig_identifier_2; + print AMBIG_2 "$sequence_2\n"; + print AMBIG_2 $ident_2; + print AMBIG_2 "$quality_value_2\n"; + } + + # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified + elsif ($unmapped and $return == 1){ + # seq_1 + print UNMAPPED_1 $orig_identifier_1; + print UNMAPPED_1 "$sequence_1\n"; + print UNMAPPED_1 $ident_1; + print UNMAPPED_1 "$quality_value_1\n"; + # seq_2 + print UNMAPPED_2 $orig_identifier_2; + print UNMAPPED_2 "$sequence_2\n"; + print UNMAPPED_2 $ident_2; + print UNMAPPED_2 "$quality_value_2\n"; + } + } + + warn "Processed $counting{sequences_count} sequences in total\n\n"; + + close OUT or die $!; + + print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); + +} + +sub check_bowtie_results_single_end{ + my ($sequence,$identifier,$quality_value) = @_; + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome + foreach my $index (0..$#fhs){ + + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + ############################################################### + ### STEP I Now processing the alignment stored in last_line ### + ############################################################### + my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation + ### we only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found_1 == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + + unless($mismatch_info){ + $mismatch_info = ''; + } + + chomp $mismatch_info; + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + my $alignment_location = join (":",$chromosome,$position); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + $number_of_mismatches = undef; + ################################################################################################################################################## + ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a + ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will + ### be returned as $valid_alignment_found and it will then be processed in the next round only. + ################################################################################################################################################## + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); + ### we only continue to extract useful information about this second alignment if 1 was returned + if ($valid_alignment_found_2 == 1){ + ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself + ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) + my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; + unless($mismatch_info){ + $mismatch_info = ''; + } + chomp $mismatch_info; + + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches; + if ($mismatch_info eq ''){ + $number_of_mismatches = 0; + } + elsif ($mismatch_info =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info); + $number_of_mismatches = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + ### extracting the chromosome number from the bowtie output (see above) + my $alignment_location = join (":",$chromosome,$position); + ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position + ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this + ### case we are not writing the same entry out a second time. + unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ + $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; + $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; + $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; + $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; + } + #################################################################################################################################### + #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### + #################################################################################################################################### + $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + next; + } + else { + # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + ### still within the 2nd sequence in correct orientation found + } + ### still withing the 1st sequence in correct orientation found + } + ### still within the if (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + if ($unmapped){ + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + else{ + return; + } + } + ####################################################################################################################################################### + ####################################################################################################################################################### + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### + ####################################################################################################################################################### + ####################################################################################################################################################### + ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### sorting in ascending order + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + + ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + } + } + elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ + ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and + ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a + ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome + ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite + ### reaction. E.g. + ### CAGTCACGCGCGCGCG will become + ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. + ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave + ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: + ### G->A conversion: + ### highly methylated: CAATCACACACACACA + ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce + ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the + ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted + ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. + ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions + ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of + ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. + ### In the above example the number of transliterations required to transform the actual sequence + ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) + ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) + ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments + ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed + ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be + ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. + my @three_candidate_seqs; + foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ + my $transliterations_performed; + if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); + } + elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ + $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); + } + else{ + die "unexpected index number range $!\n"; + } + push @three_candidate_seqs,{ + index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, + bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, + mismatch_number => $mismatch_number, + chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, + position => $mismatches{$mismatch_number}->{$composite_location}->{position}, + seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, + transliterations_performed => $transliterations_performed, + }; + } + ### sorting in ascending order for the lowest number of transliterations performed + @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; + my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; + my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; + my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; + # print "$first_array_element\t$second_array_element\t$third_array_element\n"; + if (($first_array_element*2) < $second_array_element){ + $counting{low_complexity_alignments_overruled_count}++; + ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits + $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; + $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; + # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; + } + else{ + $sequence_fails = 1; + } + } + else{ + $sequence_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it will have a unique best alignment + $counting{unique_best_alignment_count}++; + if ($pbat){ + extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params); + } + else{ + extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); + } + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + + print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out +} + +sub check_bowtie_results_single_end_bowtie2{ + my ($sequence,$identifier,$quality_value) = @_; + + + unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout + $quality_value = 'I'x(length$sequence); + } + + # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. + # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs + # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; + + my $alignment_ambiguous = 0; + + my %alignments = (); + + ### reading from the Bowtie 2 output filehandles + foreach my $index (0..$#fhs){ + # print "Index: $index\n"; + # print "$fhs[$index]->{last_line}\n"; + # print "$fhs[$index]->{last_seq_id}\n"; + # sleep (1); + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence we are currently looking at produced an alignment we are doing various things with it + # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; + + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; + + ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag == 4){ + ## reading in the next alignment, which must be the next sequence + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + next; + } + } + + # if there are one or more proper alignments we can extract the chromosome number + my $chromosome; + if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ + $chromosome = $mapped_chromosome; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome\n"; + } + + ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score,$second_best,$MD_tag); + my @fields = split (/\t/,$fhs[$index]->{last_line}); + + foreach (11..$#fields){ + if ($fields[$_] =~ /AS:i:(.*)/){ + $alignment_score = $1; + } + elsif ($fields[$_] =~ /XS:i:(.*)/){ + $second_best = $1; + } + elsif ($fields[$_] =~ /MD:Z:(.*)/){ + $MD_tag = $1; + } + } + + # warn "First best alignment_score is: '$alignment_score'\n"; + # warn "MD tag is: '$MD_tag'\n"; + die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag); + + if (defined $second_best){ + # warn "second best alignment_score is: '$second_best'\n\n"; + + # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether + if ($alignment_score == $second_best){ + $alignment_ambiguous = 1; + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{CIGAR} = $cigar; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + } + + ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + last; # break free in case we have reached the end of the alignment output + } + } + # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join (":",$chromosome,$position); + + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id; + $alignments{$alignment_location}->{alignment_score} = $alignment_score; + $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome; + $alignments{$alignment_location}->{position} = $position; + $alignments{$alignment_location}->{MD_tag} = $MD_tag; + $alignments{$alignment_location}->{CIGAR} = $cigar; + } + + my $newline = $fhs[$index]->{fh}-> getline(); + if ($newline){ + chomp $newline; + my ($seq_id) = split (/\t/,$newline); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + if ($seq_id eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + } + } + } + } + + ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file + unless(%alignments){ + $counting{no_single_alignment_found}++; + # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); + # print "$unmapped_read_output\n"; + if ($unmapped){ + return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified + } + else{ + return 0; # default + } + } + + ####################################################################################################################################################### + + ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence altogether. + ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for + ### opening (5) and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + my $methylation_call_params; # hash reference which will store all information we need for the methylation call + my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + # if (scalar keys %alignments > 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; + # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "pos: $alignments{$alignment_location}->{position}\n"; + # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the hash with we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence altogether + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_alignment_score; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ + # print "$alignments{$alignment_location}->{alignment_score}\n"; + unless (defined $best_alignment_score){ + $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score: $best_alignment_score\n"; + } + else{ + ### if the second best alignment has the same alignment score as the first one, the sequence will get booted + if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ + # warn "Same alignment score, the sequence will get booted!\n"; + $sequence_fails = 1; + last; # exiting after the second alignment since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; + $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; + $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; + last; # exiting after processing the second alignment since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions + if ($sequence_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); + # print OUT "$ambiguous_read_output\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it has a unique best alignment + $counting{unique_best_alignment_count}++; + + ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well + extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); + + ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); + print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); + return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out +} + + +sub determine_number_of_transliterations_performed{ + my ($sequence,$read_conversion) = @_; + my $number_of_transliterations; + if ($read_conversion eq 'CT'){ + $number_of_transliterations = $sequence =~ tr/C/T/; + } + elsif ($read_conversion eq 'GA'){ + $number_of_transliterations = $sequence =~ tr/G/A/; + } + else{ + die "Read conversion mode of the read was not specified $!\n"; + } + return $number_of_transliterations; +} + +sub decide_whether_single_end_alignment_is_valid{ + my ($index,$identifier) = @_; + + # extracting from Bowtie 1 format + my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; + + ### ensuring that the entry is the correct sequence + if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation we need to read in a new line + elsif($orientation == 0){ + my $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + ($id,$strand) = (split (/\t/,$newline))[0,1]; + + ### ensuring that the next entry is still the correct sequence + if ($id eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); + ### If the orientation was correct can we move on + if ($orientation == 1){ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 1; ### 2nd possibility for a sequence to pass + } + ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs + elsif ($orientation == 0){ + $newline = $fhs[$index]->{fh}->getline(); + if ($newline){ + my ($seq_id) = split (/\t/,$newline); + ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); + $fhs[$index]->{last_seq_id} = $seq_id; + $fhs[$index]->{last_line} = $newline; + return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation + } + else{ + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $id; + $fhs[$index]->{last_line} = $newline; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line} = undef; + return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round + else{ + return 0; + } +} +######################### +### BOWTIE 1 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + # sleep (1); + my %mismatches = (); + ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome + + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary + ### strands are not being reported by specifying --directional + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; + + ################################################################################## + ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### + ################################################################################## + my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong + ### orientation. We only continue to extract useful information about this alignment if 1 was returned + if ($valid_alignment_found == 1){ + ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. + ### we store the useful information in %mismatches + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + + ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + ### Now extracting the number of mismatches to the converted genome + my $number_of_mismatches_1; + my $number_of_mismatches_2; + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "Position 1 is higher than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ################################################################################################################################################### + ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### + ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### + ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### + ### this round ### + ################################################################################################################################################### + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + else{ + die "Either read 1 or read 2 needs to end on '/1'\n"; + } + + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### Now processing the entry we just stored in last_line_1 and last_line_2 + $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); + ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to + ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation + if ($valid_alignment_found == 1){ + ### we store the useful information in %mismatches + ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; + ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + $number_of_mismatches_1=''; + $number_of_mismatches_2=''; + ### Now extracting the number of mismatches to the converted genome + if ($mismatch_info_1 eq ''){ + $number_of_mismatches_1 = 0; + } + elsif ($mismatch_info_1 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_1); + $number_of_mismatches_1 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + if ($mismatch_info_2 eq ''){ + $number_of_mismatches_2 = 0; + } + elsif ($mismatch_info_2 =~ /^\d/){ + my @mismatches = split (/,/,$mismatch_info_2); + $number_of_mismatches_2 = scalar @mismatches; + } + else{ + die "Something weird is going on with the mismatch field\n"; + } + ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments + $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; + ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table + die "position 1 is greater than position 2" if ($position_1 > $position_2); + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same + ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index + ### number for the found alignment) + unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ + $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; + $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; + } + ############################################################################################################################################### + ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### + ############################################################################################################################################### + $newline_1 = $fhs[$index]->{fh}-> getline(); + $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_1; + } + if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $fhs[$index]->{last_seq_id} = $seq_id_2; + } + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else { + # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; # jumping to the next index + } + ### within the 2nd sequence pair alignment in correct orientation found + } + ### within the 1st sequence pair alignment in correct orientation found + } + ### still within the (last_seq_id eq identifier) condition + } + ### still within foreach index loop + } + ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file + unless(%mismatches){ + $counting{no_single_alignment_found}++; + return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified + } + ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + my $sequence_pair_fails = 0; + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference! + ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the + ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the + ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether + foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ + #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; + foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ + #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; + # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; + } + if (scalar keys %{$mismatches{$mismatch_number}} == 1){ + # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; + for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ + $methylation_call_params->{$identifier}->{seq_id} = $identifier; + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; + $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; + $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); + $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; + $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; + } + } + else{ + $sequence_pair_fails = 1; + } + ### after processing the alignment with the lowest number of mismatches we exit + last; + } + ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified + } + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified + } + else{ + return 0; # => exits to next sequence (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); + + ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### otherwise we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + + print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +######################### +### BOWTIE 2 | PAIRED-END +######################### + +sub check_bowtie_results_paired_ends_bowtie2{ + my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; + + ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 + unless ($quality_value_1){ + $quality_value_1 = 'I'x(length$sequence_1); + } + + unless ($quality_value_2){ + $quality_value_2 = 'I'x(length$sequence_2); + } + + + # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; + + + my %alignments; + my $alignment_ambiguous = 0; + + ### reading from the Bowtie 2 output filehandles + + ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. + ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). + ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) + ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary + ### strands are not being reported when '--directional' is specified + + foreach my $index (0,3,1,2){ + ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) + next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); + + ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it + if ($fhs[$index]->{last_seq_id} eq $identifier) { + + my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; + my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; + # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; + # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; + # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; + # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; + $id_1 =~ s/\/1$//; + $id_2 =~ s/\/2$//; + + # SAM format specifications for Bowtie 2 + # (1) Name of read that aligned + # (2) Sum of all applicable flags. Flags relevant to Bowtie are: + # 1 The read is one of a pair + # 2 The alignment is one end of a proper paired-end alignment + # 4 The read has no reported alignments + # 8 The read is one of a pair and has no reported alignments + # 16 The alignment is to the reverse reference strand + # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand + # 64 The read is mate 1 in a pair + # 128 The read is mate 2 in a pair + # 256 The read has multiple mapping states + # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) + # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) + # (5) Mapping quality (255 means MAPQ is not available) + # (6) CIGAR string representation of alignment (* if unavailable) + # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. + # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. + # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. + # (10) Read sequence (reverse-complemented if aligned to the reverse strand) + # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. + # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: + # AS:i: Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. + # XS:i: Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. + # YS:i: Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. + # XN:i: The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. + # XM:i: The number of mismatches in the alignment. Only present if SAM record is for an aligned read. + # XO:i: The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # XG:i: The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. + # NM:i: The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. + # YF:Z: String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. + # MD:Z: A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. + + ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). + ### We can store the next alignment and move on to the next Bowtie 2 instance + if ($flag_1 == 77 and $flag_2 == 141){ + ## reading in the next alignment, which must be the next sequence + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + # print "current sequence ($identifier) did not map, reading in next sequence\n"; + # print "$index\t$fhs[$index]->{last_seq_id}\n"; + # print "$index\t$fhs[$index]->{last_line_1}\n"; + # print "$index\t$fhs[$index]->{last_line_2}\n"; + next; # next instance + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + next; + } + } + + ### If there are one or more proper alignments we can extract the chromosome number + my ($chromosome_1,$chromosome_2); + if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ + $chromosome_1 = $mapped_chromosome_1; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_1\n"; + } + if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ + $chromosome_2 = $mapped_chromosome_2; + } + else{ + die "Chromosome number extraction failed for $mapped_chromosome_2\n"; + } + + die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); + + ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string + my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); + + my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); + my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); + + foreach (11..$#fields_1){ + if ($fields_1[$_] =~ /AS:i:(.*)/){ + $alignment_score_1 = $1; + } + elsif ($fields_1[$_] =~ /XS:i:(.*)/){ + $second_best_1 = $1; + } + elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ + $MD_tag_1 = $1; + } + } + + foreach (11..$#fields_2){ + if ($fields_2[$_] =~ /AS:i:(.*)/){ + $alignment_score_2 = $1; + } + elsif ($fields_2[$_] =~ /XS:i:(.*)/){ + $second_best_2 = $1; + } + elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ + $MD_tag_2 = $1; + } + } + + die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); + die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); + + # warn "First read 1 alignment score is: '$alignment_score_1'\n"; + # warn "First read 2 alignment score is: '$alignment_score_2'\n"; + # warn "MD tag 1 is: '$MD_tag_1'\n"; + # warn "MD tag 2 is: '$MD_tag_2'\n"; + + ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments + my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; + # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; + + if (defined $second_best_1 and defined $second_best_2){ + my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; + # warn "Second best alignment_score_1 is: '$second_best_1'\n"; + # warn "Second best alignment_score_2 is: '$second_best_2'\n"; + # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; + + # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether + if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ + $alignment_ambiguous = 1; + # print "This read will be chucked (AS==XS detected)!\n"; + + ## need to read and discard all additional ambiguous reads until we reach the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if ($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment + + my $alignment_location; + if ($position_1 <= $position_2){ + $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + } + elsif($position_2 < $position_1){ + $alignment_location = join(":",$chromosome_1,$position_2,$position_1); + } + + ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + # warn "added best of several alignments to \%alignments hash\n"; + + ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence + until ($fhs[$index]->{last_seq_id} ne $identifier){ + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + last; # break free if the end of the alignment output was reached + } + } + # if($fhs[$index]->{last_seq_id}){ + # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; + # } + } + } + else{ # there is no second best hit, so we can just store this one and read in the next sequence + + my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); + # print "$alignment_location\n"; + ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse + ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite + ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only + ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB + + unless (exists $alignments{$alignment_location}){ + $alignments{$alignment_location}->{seq_id} = $id_1; + $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; + $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; + $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; + $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; + $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; + $alignments{$alignment_location}->{index} = $index; + $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine + $alignments{$alignment_location}->{position_1} = $position_1; + $alignments{$alignment_location}->{position_2} = $position_2; + $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; + $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; + $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; + $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; + $alignments{$alignment_location}->{flag_1} = $flag_1; + $alignments{$alignment_location}->{flag_2} = $flag_2; + } + + # warn "added unique alignment to \%alignments hash\n"; + + # Now reading and storing the next read pair + my $newline_1 = $fhs[$index]->{fh}-> getline(); + my $newline_2 = $fhs[$index]->{fh}-> getline(); + if ($newline_1 and $newline_2){ + chomp $newline_1; + chomp $newline_2; + # print "$newline_1\n"; + # print "$newline_2\n"; + my ($seq_id_1) = split (/\t/,$newline_1); + my ($seq_id_2) = split (/\t/,$newline_2); + $seq_id_1 =~ s/\/1$//; + $seq_id_2 =~ s/\/2$//; + # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; + + $fhs[$index]->{last_seq_id} = $seq_id_1; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + + if ($seq_id_1 eq $identifier){ + die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; + } + } + else{ + # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + } + } + } + } + + ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format + if ($alignment_ambiguous == 1){ + $counting{unsuitable_sequence_count}++; + ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; + } + } + + ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file + unless (%alignments){ + $counting{no_single_alignment_found}++; + + # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$unmapped_read_1\n"; + # print "$unmapped_read_2\n"; + if ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified + } + else{ + return 0; + } + } + + ####################################################################################################################################################### + + ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one + ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) + ### alignment score we are discarding the sequence pair altogether. + ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) + ### and extending (3 per bp) the gap. + + ####################################################################################################################################################### + + ### Declaring an empty hash reference which will store all information we need for the methylation call + my $methylation_call_params; # hash reference + my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) + + ### print contents of %alignments for debugging + ## if (scalar keys %alignments >= 1){ + # print "\n******\n"; + # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ + # print "Loc: $alignment_location\n"; + # print "ID: $alignments{$alignment_location}->{seq_id}\n"; + # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; + # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; + # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; + # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; + # print "Index $alignments{$alignment_location}->{index}\n"; + # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; + # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; + # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; + # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; + # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; + # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; + # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; + # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; + # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; + # } + # print "\n******\n"; + # } + + ### if there is only 1 entry in the %alignments hash we accept it as the best alignment + if (scalar keys %alignments == 1){ + for my $unique_best_alignment (keys %alignments){ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; + } + } + + ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case + ### we boot the sequence pair altogether) + elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ + my $best_sum_of_alignment_scores; + my $best_alignment_location; + foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ + # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; + unless (defined $best_sum_of_alignment_scores){ + $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; + $best_alignment_location = $alignment_location; + # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; + } + else{ + ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted + if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ + # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; + $sequence_pair_fails = 1; + last; # exiting since we know that the sequence has ambiguous alignments + } + ### else we are going to store the best alignment for further processing + else{ + $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; + $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; + $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; + $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; + $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; + $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; + $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; + $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; + $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; + $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; + $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; + $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; + $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; + $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; + $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; + last; # exiting since the sequence produced a unique best alignment + } + } + } + } + else{ + die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; + } + + ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions + if ($sequence_pair_fails == 1){ + $counting{unsuitable_sequence_count}++; + + ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else + # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); + # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); + # print "$ambiguous_read_1\n"; + # print "$ambiguous_read_2\n"; + + if ($ambiguous){ + return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified + } + elsif ($unmapped){ + return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified + } + else{ + return 0; # => exits to next sequence pair (default) + } + } + + ### --DIRECTIONAL + ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore + ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol + if ($directional){ + if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ + # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; + $counting{alignments_rejected_count}++; + return 0; + } + } + + ### If the sequence pair has not been rejected so far it does have a unique best alignment + $counting{unique_best_alignment_count}++; + extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); + + ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; + $counting{genomic_sequence_could_not_be_extracted_count}++; + return 0; + } + + ### now we are set to perform the actual methylation call + $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); + $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); + # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; + # print " $sequence_2\n"; + # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; + # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; + + print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); + return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 +} + +### + +sub decide_whether_paired_end_alignment_is_valid{ + my ($index,$identifier) = @_; + my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; + my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; + chomp $mismatch_info_1; + chomp $mismatch_info_2; + my $seq_id_1 = $id_1; + my $seq_id_2 = $id_2; + $seq_id_1 =~ s/\/1$//; # removing the read /1 + $seq_id_2 =~ s/\/1$//; # removing the read /1 + + ### ensuring that the current entry is the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically + ### sensible alignments + my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation we need to read in two new lines + elsif($orientation == 0){ + my $newline_1 = $fhs[$index]->{fh}->getline(); + my $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) + ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; + ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; + + my $seqid; + $seq_id_1 = $id_1; + $seq_id_2 = $id_2; + # we need to capture the first read (ending on /1) + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### ensuring that the next entry is still the correct sequence + if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ + ### checking orientation again + $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); + ### If the orientation was correct can we move on + if ($orientation == 1){ + ### Writing the current sequence to last_line_1 and last_line_2 + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS + } + ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be + ### the next entry) + elsif ($orientation == 0){ + $newline_1 = $fhs[$index]->{fh}->getline(); + $newline_2 = $fhs[$index]->{fh}->getline(); + if ($newline_1 and $newline_2){ + ($seq_id_1) = split (/\t/,$newline_1); + ($seq_id_2) = split (/\t/,$newline_2); + + $seqid = ''; + if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_1; + } + elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag + $seqid = $seq_id_2; + } + else{ + die "One of the two reads needs to end on /1!!"; + } + + ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with + ### the same fields of the just read next entry + die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + else { + ### assigning undef to last_seq_id and last_line (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs + else{ + $fhs[$index]->{last_seq_id} = $seqid; + $fhs[$index]->{last_line_1} = $newline_1; + $fhs[$index]->{last_line_2} = $newline_2; + return 0; # processing the new alignment result only in the next round + } + } + else { + # assigning undef to last_seq_id and both last_lines (end of bowtie output) + $fhs[$index]->{last_seq_id} = undef; + $fhs[$index]->{last_line_1} = undef; + $fhs[$index]->{last_line_2} = undef; + return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation + } + } + else{ + die "The orientation of the alignment must be either correct or incorrect\n"; + } + } + ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round + else{ + return 0; + } +} + +### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the first or last position of the actually observed sequence + my $non_bisulfite_sequence_1; + my $non_bisulfite_sequence_2; + + ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic + ### sequences around! + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### for hits on the forward strand we need to capture 2 extra bases at the 3' end + + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 + + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) reverse strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + + ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) + ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end + if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) + ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) forward strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is CT converted so we need to capture 2 extra 3' bases + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 + } + else{ + $non_bisulfite_sequence_2 = ''; + } + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + + ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! + ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand + ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation + if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 + $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 + ### the reverse strand sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + $non_bisulfite_sequence_1 = ''; + } + + ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) + ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! + ### Read 2 is GA converted so we need to capture 2 extra 5' bases + $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; +} + +### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END + +sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; + my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; + my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; + my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; + # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + # sleep(10); + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_read_1; + my $alignment_read_2; + my $read_conversion_info_1; + my $read_conversion_info_2; + my $genome_conversion; + + ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence_1 = ''; + my $non_bisulfite_sequence_2 = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; + my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; + + # parsing CIGAR 1 string + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); + # parsing CIGAR 2 string + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + + my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output + my $indels_2 = 0; + + ### Extracting read 1 genomic sequence ### + + # extracting 2 additional bp at the 5' end (read 1) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); + } + + foreach (0..$#len_1){ + if ($ops_1[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); + # warn "$non_bisulfite_sequence_1\n"; + # adjusting position + $pos_1 += $len_1[$_]; + } + elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence_1 .= 'N' x $len_1[$_]; + # warn "$non_bisulfite_sequence_1\n"; + # position doesn't need adjusting + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; + $pos_1 += $len_1[$_]; + $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else{ + die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + ### 3' end of read 1 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + return; + } + + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); + } + + + ### Extracting read 2 genomic sequence ### + + ### 5' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); + } + + foreach (0..$#len_2){ + if ($ops_2[$_] eq 'M'){ + # extracting genomic sequence + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); + # warn "$non_bisulfite_sequence_2\n"; + # adjusting position + $pos_2 += $len_2[$_]; + } + elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence_2 .= 'N' x $len_2[$_]; + # warn "$non_bisulfite_sequence_2\n"; + # position doesn't need adjusting + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; + $pos_2 += $len_2[$_]; + $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output + } + elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else{ + die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + + ### 3' end of read 2 + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise + # need to set read 1 as well now to prevent warning + # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; + # sleep(5); + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + return; + } + $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); + } + + ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was + ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly + + ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_GA_CT_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'CT'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from complementary to (converted) bottom strand] + $counting{GA_CT_GA_count}++; + $alignment_read_1 = '+'; + $alignment_read_2 = '-'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'GA'; + ### Read 1 is always the forward hit + ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented + $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); + } + + ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from the complementary to (converted) top strand] + $counting{GA_CT_CT_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'GA'; + $read_conversion_info_2 = 'CT'; + $genome_conversion = 'CT'; + + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + + ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from the (converted) reverse strand] + $counting{CT_GA_GA_count}++; + $alignment_read_1 = '-'; + $alignment_read_2 = '+'; + $read_conversion_info_1 = 'CT'; + $read_conversion_info_2 = 'GA'; + $genome_conversion = 'GA'; + ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented + $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); + } + else{ + die "Too many bowtie result filehandles\n"; + } + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + + $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; + $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; + $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; + ## the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; + $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; + $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; + $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_result_single_end{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. + $methylation_call_params->{$identifier}->{position} += 1; + + ### writing every uniquely mapped read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); + print OUT "$bowtie1_output\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script + } +} + +########################################## +### PRINT SINGLE END RESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_result_single_end_bowtie2{ + my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value = convert_phred64_quals_to_phred33($quality_value); + } + elsif ($solexa){ + $quality_value = convert_solexa_quals_to_phred33($quality_value); + } + + ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) + single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 1 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) + $methylation_call_params->{$identifier}->{start_seq_1} += 1; + + ### writing every single aligned read and its methylation call to the output file + if ($vanilla){ + my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); + print OUT "$bowtie1_output_paired_end\n"; + } + else{ # SAM output, default since Bismark v1.0.0 + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + } + +} + +########################################## +### PRINT PAIRED END ESULTS: Bowtie 2 ### +########################################## + +sub print_bisulfite_mapping_results_paired_ends_bowtie2{ + my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; + + ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) + if ($phred64){ + $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); + } + elsif ($solexa){ + $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); + $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); + } + + ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) + paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script + +} + + +sub convert_phred64_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_solexa_quals_to_phred33{ + + my $qual = shift; + my @quals = split (//,$qual); + my @new_quals; + + foreach my $index (0..$#quals){ + my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); + my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); + $new_quals[$index] = $phred33_quality_string; + } + + my $phred33_quality = join ("",@new_quals); + return $phred33_quality; +} + +sub convert_phred_score_into_phred33_quality_string{ + my $qual = shift; + $qual = chr($qual+33); + return $qual; +} + +sub convert_phred64_quality_string_into_phred_score{ + my $string = shift; + my $qual = ord($string)-64; + return $qual; +} + +sub convert_solexa_pre1_3_quality_string_into_phred_score{ + ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 + my $string = shift; + my $qual = ord($string)-59; + return $qual; +} + + +sub extract_corresponding_genomic_sequence_single_end { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence; + ### depending on the conversion we want to make need to capture 1 extra base at the 3' end + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 + ### + 2 extra base at the 3' end + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > + ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > + ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### at this point we can also determine the end position of a read + $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); +} + +sub extract_corresponding_genomic_sequence_single_end_pbat { + my ($sequence_identifier,$methylation_call_params) = @_; + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, + ### if the C happens to be at the last position of the actually observed sequence + my $non_bisulfite_sequence; + ### depending on the conversion we want to make need to capture 1 extra base at the 3' end + + my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1! + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ($pbat_index == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 + ### + 2 extra base at the 3' end + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ($pbat_index == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > + ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ($pbat_index == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 + ## reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + else{ + $non_bisulfite_sequence = ''; + } + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ($pbat_index == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > + ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand + $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 + } + else{ + $non_bisulfite_sequence = ''; + } + } + else{ + die "Too many bowtie result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### at this point we can also determine the end position of a read + $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); +} + + +sub extract_corresponding_genomic_sequence_single_end_bowtie2{ + my ($sequence_identifier,$methylation_call_params) = @_; + + my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info}; + my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; + + ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the + ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call + + ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, + ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions + my $alignment_strand; + my $read_conversion_info; + my $genome_conversion; + ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and + ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence + my $non_bisulfite_sequence = ''; + + ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings + my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); + } + my $indels = 0; + + foreach (0..$#len){ + if ($ops[$_] eq 'M'){ + #extracting genomic sequence + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); + # adjusting position + $pos += $len[$_]; + } + elsif ($ops[$_] eq 'I'){ # insertion in the read sequence + # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls + $non_bisulfite_sequence .= 'N' x $len[$_]; + # warn "$non_bisulfite_sequence\n"; + # position doesn't need to be adjusting + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif ($ops[$_] eq 'D'){ # deletion in the read sequence + # we do not add any genomic sequence but only adjust the position + $pos += $len[$_]; + $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) + } + elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else{ + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + + ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) + if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ + ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome + unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + return; + } + $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); + # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; + } + + + + ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) + if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ + ### [Index 0, sequence originated from (converted) forward strand] + $counting{CT_CT_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'CT'; + $genome_conversion = 'CT'; + } + + ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ + ### [Index 1, sequence originated from (converted) reverse strand] + $counting{CT_GA_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'CT'; + $genome_conversion = 'GA'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + + ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ + ### [Index 2, sequence originated from complementary to (converted) forward strand] + $counting{GA_CT_count}++; + $alignment_strand = '-'; + $read_conversion_info = 'GA'; + $genome_conversion = 'CT'; + + ### reverse complement! + $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); + } + + ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) + elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ + ### [Index 3, sequence originated from complementary to (converted) reverse strand] + $counting{GA_GA_count}++; + $alignment_strand = '+'; + $read_conversion_info = 'GA'; + $genome_conversion = 'GA'; + + } + else{ + die "Too many Bowtie 2 result filehandles\n"; + } + + $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; + $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; + $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; + + ### the end position of a read is stored in $pos + $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; + $methylation_call_params->{$sequence_identifier}->{indels} = $indels; +} + +### METHYLATION CALL + +sub methylation_call{ + my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; + ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one + my @seq = split(//,$sequence_actually_observed); + my @genomic = split(//,$genomic_sequence); + # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; + ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either + ### CpG, CHH or CHG context + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in unknown context (was protected) ### + ### u for not methylated C in unknwon context (was converted) ### + ################################################################# + + my @match =(); + warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 + my $methyl_CHH_count = 0; + my $methyl_CHG_count = 0; + my $methyl_CpG_count = 0; + my $methyl_C_unknown_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CpG_count = 0; + my $unmethylated_C_unknown_count = 0; + + if ($read_conversion eq 'CT'){ + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index]) { + ### The residue can only be a C if it was not converted to T, i.e. protected my methylation + if ($genomic[$index] eq 'C') { + ### If the residue is a C we want to know if it was in CpG context or in any other context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C, methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C, methylated, in Unknown context + } + else { + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C, methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C, methylated, in CHH context + } + } + } + else { + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts + ### in the actually observed sequence + if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { + ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context + my $downstream_base = $genomic[$index+1]; + + if ($downstream_base eq 'G'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C, not methylated, in CpG context + } + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second downstream base context + my $second_downstream_base = $genomic[$index+2]; + + if ($second_downstream_base eq 'G'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C, not methylated, in CHG context + } + elsif ($second_downstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + elsif ($read_conversion eq 'GA'){ + # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; + + for my $index (0..$#seq) { + if ($seq[$index] eq $genomic[$index+2]) { + ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation + if ($genomic[$index+2] eq 'G') { + ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need + ### to look if the base upstream is a C + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$methyl_CpG_count; + push @match,'Z'; # protected C on opposing strand, methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C on opposing strand, methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$methyl_CHG_count; + push @match,'X'; # protected C on opposing strand, methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } + else{ + ++$methyl_CHH_count; + push @match,'H'; # protected C on opposing strand, methylated, in CHH context + } + } + } + else{ + push @match, '.'; + } + } + elsif ($seq[$index] ne $genomic[$index+2]) { + ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts + ### on the opposing strand, so G to A conversions in the actually observed sequence + if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { + ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if + ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! + + my $upstream_base = $genomic[$index+1]; + + if ($upstream_base eq 'C'){ + ++$unmethylated_CpG_count; + push @match,'z'; # converted C on opposing strand, not methylated, in CpG context + } + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ### C in not in CpG-context, determining the second upstream base context + my $second_upstream_base = $genomic[$index]; + + if ($second_upstream_base eq 'C'){ + ++$unmethylated_CHG_count; + push @match,'x'; # converted C on opposing strand, not methylated, in CHG context + } + elsif ($second_upstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } + else{ + ++$unmethylated_CHH_count; + push @match,'h'; # converted C on opposing strand, not methylated, in CHH context + } + } + } + ### all other mismatches are not of interest for a methylation call + else { + push @match,'.'; + } + } + else{ + die "There can be only 2 possibilities\n"; + } + } + } + else{ + die "Strand conversion info is required to perform a methylation call\n"; + } + + my $methylation_call = join ("",@match); + + $counting{total_meCHH_count} += $methyl_CHH_count; + $counting{total_meCHG_count} += $methyl_CHG_count; + $counting{total_meCpG_count} += $methyl_CpG_count; + $counting{total_meC_unknown_count} += $methyl_C_unknown_count; + $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; + $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; + $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; + $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; + + # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; + return $methylation_call; +} + +sub read_genome_into_memory{ + ## working directoy + my $cwd = shift; + ## reading in and storing the specified genome in the %chromosomes hash + chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; + print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; + + my @chromosome_filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@chromosome_filenames){ + @chromosome_filenames = <*.fasta>; + } + + unless (@chromosome_filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; + } + + foreach my $chromosome_filename (@chromosome_filenames){ + + open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + $first_line =~ s/\r//; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + my $sequence; + while (){ + chomp; + $_ =~ s/\r//; + if ($_ =~ /^>/){ + ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\n"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; + } + else { + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + ### resetting the sequence variable + $sequence = ''; + ### setting new chromosome name + $chromosome_name = extract_chromosome_name($_); + } + else{ + $sequence .= uc$_; + } + } + + if (exists $chromosomes{$chromosome_name}){ + print "chr $chromosome_name (",length $sequence ," bp)\t"; + die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; + } + else{ + if (length($sequence) == 0){ + warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; + } + print "chr $chromosome_name (",length $sequence ," bp)\n"; + $chromosomes{$chromosome_name} = $sequence; + } + } + print "\n"; + chdir $cwd or die "Failed to move to directory $cwd\n"; +} + +sub extract_chromosome_name { + ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well + my $fasta_header = shift; + if ($fasta_header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$fasta_header); + return $chromosome_name; + } + else{ + die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; + } +} + +sub reverse_complement{ + my $sequence = shift; + $sequence =~ tr/CATG/GTAC/; + $sequence = reverse($sequence); + return $sequence; +} + +sub biTransformFastAFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fa.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fa.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + + unless ($directional){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ### small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); + + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT "$header$sequence_C_to_T"; + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA "$header$sequence_G_to_A"; + } + } + close CTOT or die "Failed to close filehandle $!\n"; + + if ($directional){ + warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastAFiles_paired_end { + my ($file,$read_number) = @_; + + if ($gzip){ + warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n"; + sleep (2); + } + + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + $C_to_T_infile =~ s/$/_C_to_T.fa/; + $G_to_A_infile =~ s/$/_G_to_A.fa/; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + } + else{ # all four strand output + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + + my $count = 0; + + while (1){ + my $header = ; + my $sequence= ; + last unless ($header and $sequence); + + $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($header,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence seems to be in FastA format + die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/); + + if ($read_number == 1){ + if ($bowtie2){ + $header =~ s/$/\/1\/1/; + } + else{ + $header =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $header =~ s/$/\/2\/2/; + } + else{ + $header =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2, but was: $read_number\n\n"; + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + + if ($read_number == 1){ + print CTOT "$header$sequence_C_to_T"; + } + elsif ($read_number == 2){ + print GTOA "$header$sequence_G_to_A"; + } + } + else{ + print CTOT "$header$sequence_C_to_T"; + print GTOA "$header$sequence_G_to_A"; + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + return ($C_to_T_infile); + } + else{ + return ($G_to_A_infile); + } + } + else{ + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +sub biTransformFastQFiles { + my $file = shift; + my ($dir,$filename); + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($pbat){ # PBAT-Seq + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ # directional or non-directional + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + } + + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option + } + + unless ($directional){ + if ($gzip){ + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + ++$count; + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence = uc$sequence; # make input file case insensitive + + # detecting if the input file contains tab stops, as this is likely to result in no alignments + if (index($identifier,"\t") != -1){ + $seqID_contains_tabs++; + } + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + if ($pbat){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + else{ # directional or non-directional + my $sequence_C_to_T = $sequence; + $sequence_C_to_T =~ tr/C/T/; + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + + unless ($directional){ + my $sequence_G_to_A = $sequence; + $sequence_G_to_A =~ tr/G/A/; + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + } + + if ($directional){ + close CTOT or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + elsif($pbat){ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + + return ($C_to_T_infile,$G_to_A_infile); +} + +sub biTransformFastQFiles_paired_end { + my ($file,$read_number) = @_; + my ($dir,$filename); + + if ($file =~ /\//){ + ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file; + } + + ### gzipped version of the infile + if ($file =~ /\.gz$/){ + open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; + } + else{ + open (IN,$file) or die "Couldn't read from file $file: $!\n"; + } + + if ($skip){ + warn "Skipping the first $skip reads from $file\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file\n"; + sleep (1); + } + + my $C_to_T_infile = my $G_to_A_infile = $filename; + + if ($gzip){ + $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; + $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; + } + else{ + $C_to_T_infile =~ s/$/_C_to_T.fastq/; + $G_to_A_infile =~ s/$/_G_to_A.fastq/; + } + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + + if ($directional){ + if ($read_number == 1){ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + } + } + elsif ($read_number == 2){ + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + else{ + die "Read number needs to be 1 or 2, but was $read_number!\n\n"; + } + } + else{ + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; + warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; + if ($gzip){ + open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; + open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; + } + else{ + open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; + open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; + } + } + + my $count = 0; + while (1){ + my $identifier = ; + my $sequence = ; + my $identifier2 = ; + my $quality_score = ; + last unless ($identifier and $sequence and $identifier2 and $quality_score); + ++$count; + + $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence= uc$sequence; # make input file case insensitive + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ + die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + my $sequence_C_to_T = my $sequence_G_to_A = $sequence; + + if ($read_number == 1){ + if ($bowtie2){ + $identifier =~ s/$/\/1\/1/; + } + else{ + $identifier =~ s/$/\/1/; + } + } + elsif ($read_number == 2){ + if ($bowtie2){ + $identifier =~ s/$/\/2\/2/; + } + else{ + $identifier =~ s/$/\/2/; + } + } + else{ + die "Read number needs to be 1 or 2\n"; + } + + $sequence_C_to_T =~ tr/C/T/; + $sequence_G_to_A =~ tr/G/A/; + + if ($directional){ + if ($read_number == 1){ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + } + else{ + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + else{ + print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); + print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); + } + } + + if ($directional){ + if ($read_number == 1){ + warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + else{ + warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; + } + } + else{ + warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; + } + if ($directional){ + if ($read_number == 1){ + close CTOT or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile); + } + else{ + close GTOA or die "Failed to close filehandle $!\n"; + return ($G_to_A_infile); + } + } + else{ + close CTOT or die "Failed to close filehandle $!\n"; + close GTOA or die "Failed to close filehandle $!\n"; + return ($C_to_T_infile,$G_to_A_infile); + } +} + + +### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES + +sub biTransformFastQFiles_paired_end_bowtie1_gzip { + my ($file_1,$file_2) = @_; + my ($dir,$filename); + + if ($file_1 =~ /\//){ + ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; + } + else{ + $filename = $file_1; + } + + ### gzipped version of infile 1 + if ($file_1 =~ /\.gz$/){ + open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n"; + } + else{ + open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; + } + ### gzipped version of infile 2 + if ($file_2 =~ /\.gz$/){ + open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n"; + } + else{ + open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; + } + + + if ($skip){ + warn "Skipping the first $skip reads from $file_1 and $file_2\n"; + sleep (1); + } + if ($upto){ + warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; + sleep (1); + } + + my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; + + if ($prefix){ + # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; + $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; + $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + } + + $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; + $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + + warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; + open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; + # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; + + unless ($directional){ + print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; + open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; + } + + ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format: + ### + + my $count = 0; + while (1){ + my $identifier_1 = ; + my $sequence_1 = ; + my $identifier2_1 = ; + my $quality_score_1 = ; + + my $identifier_2 = ; + my $sequence_2 = ; + my $identifier2_2 = ; + my $quality_score_2 = ; + + last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); + + ++$count; + + ## small check if the sequence file appears to be a FastQ file + if ($count == 1){ + if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ + die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ + die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; + } + } + + $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces + chomp $identifier_1; + chomp $sequence_1; + chomp $sequence_2; + chomp $quality_score_1; + chomp $quality_score_2; + + $identifier_1 =~ s/^\@//; + $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) + + if ($skip){ + next unless ($count > $skip); + } + if ($upto){ + last if ($count > $upto); + } + + $sequence_1 = uc$sequence_1; # make input file 1 case insensitive + $sequence_2 = uc$sequence_2; # make input file 2 case insensitive + + # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; + my $sequence_1_C_to_T = $sequence_1; + my $sequence_2_G_to_A = $sequence_2; + $sequence_1_C_to_T =~ tr/C/T/; + $sequence_2_G_to_A =~ tr/G/A/; + + print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; + + unless ($directional){ + my $sequence_1_G_to_A = $sequence_1; + my $sequence_2_C_to_T = $sequence_2; + $sequence_1_G_to_A =~ tr/G/A/; + $sequence_2_C_to_T =~ tr/C/T/; + print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; + } + } + + close CTPLUSGA or die "Couldn't close filehandle\n"; + warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; + + if ($directional){ + warn "\n"; + return ($CT_plus_GA_infile); + } + else{ + close GAPLUSCT or die "Couldn't close filehandle\n"; + warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; + return ($CT_plus_GA_infile,$GA_plus_CT_infile); + } +} + + +sub fix_IDs{ + my $id = shift; + $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores + return $id; +} + +sub ensure_sensical_alignment_orientation_single_end{ + my $index = shift; # index number if the sequence produced an alignment + my $strand = shift; + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) + ## here we only want reads in the forward (+) orientation + if ($fhs[$index]->{name} eq 'CTreadCTgenome') { + ### if the alignment is (+) we count it, and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) + ## here we only want reads in the forward (-) orientation + elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { + ### if the alignment is (-) we count it and return 1 for a correct orientation + if ($strand eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (+) the alignment is nonsensical + elsif ($strand eq '+') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } + ############################################################################################################### + ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) + ## here we only want reads in the forward (+) orientation + elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { + ### if the alignment is (+) we count it and return 1 for a correct orientation + if ($strand eq '+') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the orientation equals (-) the alignment is nonsensical + elsif ($strand eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + } else{ + die "One of the above conditions must be true\n"; + } +} + +sub ensure_sensical_alignment_orientation_paired_ends{ + my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment + ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one + my $orientation = 0; + ############################################################################################################## + ## [Index 0, sequence originated from (converted) forward strand] + ## CT converted read 1 + ## GA converted read 2 + ## CT converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 1, sequence originated from (converted) reverse strand] + ## GA converted read 1 + ## CT converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { + ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation + if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 2, sequence originated from complementary to (converted) forward strand] + ## GA converted read 1 + ## CT converted read 2 + ## CT converted genome + ## here we only want read 1 in (-) orientation and read 2 in (+) orientation + elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + ############################################################################################################### + ## [Index 3, sequence originated from complementary to (converted) reverse strand] + ## CT converted read 1 + ## GA converted read 2 + ## GA converted genome + ## here we only want read 1 in (+) orientation and read 2 in (-) orientation + elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { + ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation + if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { + $fhs[$index]->{seen}++; + $orientation = 1; + return $orientation; + } + ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical + elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { + $fhs[$index]->{wrong_strand}++; + return $orientation; + } + else{ + die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; + } + } + else{ + die "One of the above conditions must be true\n"; + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA { + + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTA + +sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 1 (default) | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + + if ($directional){ + warn "Input file is $C_to_T_infile_1 (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional or $pbat){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; # skipping unwanted filehandles + } + } + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + if ($gzip){ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n"; + open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n"; + sleep(5); + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + } + + my $line_1 = $fh->{fh}->getline(); + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +### Bowtie 2 | PAIRED-END | FASTQ + +sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; + if ($directional){ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; + } + + ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + if ($directional){ + unless ($fh->{inputfile_1}){ + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + next; + } + } + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + my $line_1 = $_; + my $line_2 = $fh->{fh}->getline(); + + # if Bowtie produces an alignment we store the first line of the output + if ($line_1 and $line_2) { + chomp $line_1; + chomp $line_2; + ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. + ### We will thus identify which sequence was read 1 and store this ID as last_seq_id + + my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) + my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line + + if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with + $fh->{last_seq_id} = $id_1; + } + elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present + $fh->{last_seq_id} = $id_2; + } + else{ + die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; + } + + $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 + $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 + warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; + } + + # otherwise we just initialise last_seq_id and last_lines as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line_1} = undef; + $fh->{last_line_2} = undef; + } + } +} + +##################################################################################################################################################### + +### Bowtie 1 (default) | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + if ($gzip){ + open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTA +sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastA)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; + } + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else{ + last; # no alignment output + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + + +### Bowtie 1 (default) | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ { + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n"; + } + elsif($pbat){ + warn "Input file is $G_to_A_infile (FastQ)\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; + } + + + ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional or $pbat){ + warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + + foreach my $fh (@fhs) { + my $bt_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt_options .= ' --nofw'; + } + + warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; + sleep (5); + + if ($gzip){ + open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; + } + else{ + open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data + } + + # if Bowtie produces an alignment we store the first line of the output + $_ = $fh->{fh}->getline(); + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +### Bowtie 2 | SINGLE-END | FASTQ +sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { + + my ($C_to_T_infile,$G_to_A_infile) = @_; + if ($directional){ + warn "Input file is $C_to_T_infile (FastQ)\n\n"; + } + else{ + warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; + } + + ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in + ## the data structure above + if ($directional){ + warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + else{ + warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; + } + foreach my $fh (@fhs) { + my $bt2_options = $bowtie_options; + if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ + $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner + } + else { + $bt2_options .= ' --nofw'; + } + warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; + warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; + + open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; + ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence + while (1){ + $_ = $fh->{fh}->getline(); + # warn "$_\n"; + # sleep(1); + if ($_) { + last unless ($_ =~ /^\@/); # SAM headers start with @ + } + else { + last; + } + } + + # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output + if ($_) { + chomp; + my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) + $fh->{last_seq_id} = $id; + $fh->{last_line} = $_; + warn "Found first alignment:\t$fh->{last_line}\n"; + # warn "storing $id and\n$_\n"; + } + # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output + else { + warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; + $fh->{last_seq_id} = undef; + $fh->{last_line} = undef; + } + } +} + +########################################################################################################################################### + +sub reset_counters_and_fhs{ + my $filename = shift; + %counting=( + total_meCHH_count => 0, + total_meCHG_count => 0, + total_meCpG_count => 0, + total_meC_unknown_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CpG_count => 0, + total_unmethylated_C_unknown_count => 0, + sequences_count => 0, + no_single_alignment_found => 0, + unsuitable_sequence_count => 0, + genomic_sequence_could_not_be_extracted_count => 0, + unique_best_alignment_count => 0, + low_complexity_alignments_overruled_count => 0, + CT_CT_count => 0, #(CT read/CT genome, original top strand) + CT_GA_count => 0, #(CT read/GA genome, original bottom strand) + GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) + GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) + CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) + GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) + GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) + CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) + alignments_rejected_count => 0, # only relevant if --directional was specified + ); + + if ($directional){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + elsif($pbat){ + if ($filename =~ ','){ # paired-end files + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + else{ # single-end files + @fhs=( + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } + } + else{ + @fhs=( + { name => 'CTreadCTgenome', + strand_identity => 'con ori forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'CTreadGAgenome', + strand_identity => 'con ori reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadCTgenome', + strand_identity => 'compl ori con forward', + bisulfiteIndex => $CT_index_basename, + seen => 0, + wrong_strand => 0, + }, + { name => 'GAreadGAgenome', + strand_identity => 'compl ori con reverse', + bisulfiteIndex => $GA_index_basename, + seen => 0, + wrong_strand => 0, + }, + ); + } +} + + +sub process_command_line{ + my @bowtie_options; + my $help; + my $mates1; + my $mates2; + my $path_to_bowtie; + my $fastq; + my $fasta; + my $skip; + my $qupto; + my $phred64; + my $phred33; + my $solexa; + my $mismatches; + my $seed_length; + my $best; + my $sequence_format; + my $version; + my $quiet; + my $chunk; + my $non_directional; + my $ceiling; + my $maxins; + my $minins; + my $unmapped; + my $multi_map; + my $output_dir; + my $bowtie2; + my $vanilla; + my $sam_no_hd; + my $seed_extension_fails; + my $reseed_repetitive_seeds; + my $most_valid_alignments; + my $score_min; + my $parallel; + my $temp_dir; + my $rdg; + my $rfg; + my $non_bs_mm; + my $samtools_path; + my $bam; + my $gzip; + my $pbat; + my $prefix; + my $old_flag; + + my $command_line = GetOptions ('help|man' => \$help, + '1=s' => \$mates1, + '2=s' => \$mates2, + 'path_to_bowtie=s' => \$path_to_bowtie, + 'f|fasta' => \$fasta, + 'q|fastq' => \$fastq, + 's|skip=i' => \$skip, + 'u|upto=i' => \$qupto, + 'phred33-quals' => \$phred33, + 'phred64-quals|solexa1' => \$phred64, + 'solexa-quals' => \$solexa, + 'n|seedmms=i' => \$mismatches, + 'l|seedlen=i' => \$seed_length, + 'no_best' => \$best, + 'version' => \$version, + 'quiet' => \$quiet, + 'chunkmbs=i' => \$chunk, + 'non_directional' => \$non_directional, + 'I|minins=i' => \$minins, + 'X|maxins=i' => \$maxins, + 'e|maqerr=i' => \$ceiling, + 'un|unmapped' => \$unmapped, + 'ambiguous' => \$multi_map, + 'o|output_dir=s' => \$output_dir, + 'bowtie2' => \$bowtie2, + 'vanilla' => \$vanilla, + 'sam-no-hd' => \$sam_no_hd, + 'D=i' => \$seed_extension_fails, + 'R=i' => \$reseed_repetitive_seeds, + 'score_min=s' => \$score_min, + 'most_valid_alignments=i' => \$most_valid_alignments, + 'p=i' => \$parallel, + 'temp_dir=s' => \$temp_dir, + 'rdg=s' => \$rdg, + 'rfg=s' => \$rfg, + 'non_bs_mm' => \$non_bs_mm, + 'samtools_path=s' => \$samtools_path, + 'bam' => \$bam, + 'gzip' => \$gzip, + 'pbat' => \$pbat, + 'prefix=s' => \$prefix, + 'old_flag' => \$old_flag, + ); + + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + if ($version){ + print << "VERSION"; + + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Version: $bismark_version + Copyright 2010-13 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + + +VERSION + exit; + } + + + ########################## + ### PROCESSING OPTIONS ### + ########################## + + unless ($bowtie2){ + $bowtie2 = 0; + } + unless ($sam_no_hd){ + $sam_no_hd =0; + } + + ### PATH TO BOWTIE + ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (-d $path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie = "${path_to_bowtie}bowtie2"; + } + else{ + $path_to_bowtie = "${path_to_bowtie}bowtie"; + } + } + else{ + die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; + } + } + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2'; + warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } + else{ + $path_to_bowtie = 'bowtie'; + warn "Path to Bowtie specified as: $path_to_bowtie\n"; + } + } + + ### OUTPUT REQUESTED AS BAM FILE + if ($bam){ + if ($vanilla){ + die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + + warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; + $bam = 1; + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; + $bam = 1; + } + } + + unless (defined $samtools_path){ + $bam = 2; + warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; + } + sleep (1); + } + + + #################################### + ### PROCESSING ARGUMENTS + + ### GENOME FOLDER + my $genome_folder = shift @ARGV; # mandatory + unless ($genome_folder){ + warn "Genome folder was not specified!\n"; + print_helpfile(); + exit; + } + + ### checking that the genome folder, all subfolders and the required bowtie index files exist + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + + if (chdir $genome_folder){ + my $absolute_genome_folder = getcwd; ## making the genome folder path absolute + unless ($absolute_genome_folder =~/\/$/){ + $absolute_genome_folder =~ s/$/\//; + } + warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; + $genome_folder = $absolute_genome_folder; + } + else{ + die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] {-1 -2 | } [] (--help for more details)\n"; + } + + my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; + my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; + + if ($bowtie2){ ### Bowtie 2 (new) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n"; + } + } + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n"; + } + } + } + + else{ ### Bowtie 1 (default) + ### checking the integrity of $CT_dir + chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; + my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); + foreach my $file(@CT_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + ### checking the integrity of $GA_dir + chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; + my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); + foreach my $file(@GA_bowtie_index){ + unless (-f $file){ + die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; + } + } + } + + my $CT_index_basename = "${CT_dir}BS_CT"; + my $GA_index_basename = "${GA_dir}BS_GA"; + + ### INPUT OPTIONS + + ### SEQUENCE FILE FORMAT + ### exits if both fastA and FastQ were specified + if ($fasta and $fastq){ + die "Only one sequence filetype can be specified (fastA or fastQ)\n"; + } + + ### unless fastA is specified explicitely, fastQ sequence format is expected by default + if ($fasta){ + print "FastA format specified\n"; + $sequence_format = 'FASTA'; + push @bowtie_options, '-f'; + } + elsif ($fastq){ + print "FastQ format specified\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + else{ + $fastq = 1; + print "FastQ format assumed (by default)\n"; + $sequence_format = 'FASTQ'; + push @bowtie_options, '-q'; + } + + ### SKIP + if ($skip){ + warn "Skipping the first $skip reads from the input file\n"; + # push @bowtie_options,"-s $skip"; + } + + ### UPTO + if ($qupto){ + warn "Processing sequences up to read no. $qupto from the input file\n"; + if ($bowtie2){ + # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 + } + else{ + # push @bowtie_options,"--qupto $qupto"; + } + } + + ### QUALITY VALUES + if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ + die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; + } + if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values works only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred33"; + } + else{ + push @bowtie_options,"--phred33-quals"; + } + } + if ($phred64){ + # Phred quality values work only when -q is specified + unless ($fastq){ + die "Phred quality values work only when -q (FASTQ) is specified\n"; + } + if ($bowtie2){ + push @bowtie_options,"--phred64"; + } + else{ + push @bowtie_options,"--phred64-quals"; + } + } + else{ + $phred64 = 0; + } + + if ($solexa){ + if ($bowtie2){ + die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; + } + # Solexa to Phred value conversion works only when -q is specified + unless ($fastq){ + die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; + } + push @bowtie_options,"--solexa-quals"; + } + else{ + $solexa = 0; + } + + ### ALIGNMENT OPTIONS + + ### MISMATCHES + if (defined $mismatches){ + if ($bowtie2){ + if ($mismatches == 0 or $mismatches == 1){ + push @bowtie_options,"-N $mismatches"; + } + else{ + die "Please set the number of multiseed mismatches for Bowtie 2 with '-N ' (where can be 0 or 1)\n"; + } + } + else{ + if ($mismatches >= 0 and $mismatches <= 3){ + push @bowtie_options,"-n $mismatches"; + } + else{ + die "Please set the number of seed mismatches for Bowtie 1 with '-n ' (where can be 0,1,2 or 3)\n"; + } + } + } + else{ + unless ($bowtie2){ + push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 + } + } + + ### SEED LENGTH + if (defined $seed_length){ + if ($bowtie2){ + push @bowtie_options,"-L $seed_length"; + } + else{ + push @bowtie_options,"-l $seed_length"; + } + } + + ### MISMATCH CEILING + if (defined $ceiling){ + die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); + push @bowtie_options,"-e $ceiling"; + } + + + ### BOWTIE 2 EFFORT OPTIONS + + ### CONSECUTIVE SEED EXTENSION FAILS + if (defined $seed_extension_fails){ + die "The option '-D ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-D $seed_extension_fails"; + } + + ### RE-SEEDING REPETITIVE SEEDS + if (defined $reseed_repetitive_seeds){ + die "The option '-R ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + push @bowtie_options,"-R $reseed_repetitive_seeds"; + } + + + ### BOWTIE 2 SCORING OPTIONS + if ($score_min){ + die "The option '--score_min ' is only available when using Bowtie 2\n\n" unless ($bowtie2); + unless ($score_min =~ /^L,.+,.+$/){ + die "The option '--score_min ' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--score-min $score_min"; + } + else{ + if ($bowtie2){ + push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2 + } + } + + ### BOWTIE 2 READ GAP OPTIONS + my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); + + if ($rdg){ + die "The option '--rdg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rdg =~ /^(\d+),(\d+)$/){ + $deletion_open = $1; + $deletion_extend = $2; + } + else{ + die "The option '--rdg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rdg $rdg"; + } + else{ + $deletion_open = 5; + $deletion_extend = 3; + } + + ### BOWTIE 2 REFERENCE GAP OPTIONS + if ($rfg){ + die "The option '--rfg ,' is only available when using Bowtie 2\n\n" unless ($bowtie2); + if ($rfg =~ /^(\d+),(\d+)$/){ + $insertion_open = $1; + $insertion_extend = $2; + } + else{ + die "The option '--rfg ,' needs to be in the format . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; + } + push @bowtie_options,"--rfg $rfg"; + } + else{ + $insertion_open = 5; + $insertion_extend = 3; + } + + + ### BOWTIE 2 PARALLELIZATION OPTIONS + if (defined $parallel){ + die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); + } + if ($bowtie2){ + if ($parallel){ + die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); + push @bowtie_options,"-p $parallel"; + push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. + print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; + sleep (2); + } + } + + ### REPORTING OPTIONS + + if ($bowtie2){ + push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default + + ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while + if(defined $most_valid_alignments){ + + warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; + # push @bowtie_options,"-M $most_valid_alignments";sleep (5); + } + # else{ + # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence + # } + } + else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 + push @bowtie_options,'-k 2'; + } + + ### --BEST + if ($bowtie2){ + if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used + die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; + } + } + else{ + # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) + unless ($best){ + push @bowtie_options,'--best'; + } + } + + ### VANILLA BISMARK (BOWTIE 1) OUTPUT + if ($vanilla){ + if ($bowtie2){ + die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; + } + } + else{ + $vanilla = 0; + } + + ### PAIRED-END MAPPING + if ($mates1){ + my @mates1 = (split (/,/,$mates1)); + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n" unless ($mates2); + my @mates2 = (split(/,/,$mates2)); + unless (scalar @mates1 == scalar @mates2){ + die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 -2 )\n"; + } + while (1){ + my $mate1 = shift @mates1; + my $mate2 = shift @mates2; + last unless ($mate1 and $mate2); + push @filenames,"$mate1,$mate2"; + } + if ($bowtie2){ + push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments + push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones + } + + if ($old_flag){ + warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); + sleep(3); + } + } + elsif ($mates2){ + die "Paired-end mapping requires the format: -1 -2 , please respecify!\n"; + } + + ### SINGLE-END MAPPING + # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified + my $singles; + unless ($mates1 and $mates2){ + $singles = join (',',@ARGV); + unless ($singles){ + die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; + } + $singles =~ s/\s/,/g; + @filenames = (split(/,/,$singles)); + warn "\nFiles to be analysed:\n"; + warn "@filenames\n\n"; + sleep (3); + } + + ### MININUM INSERT SIZE (PAIRED-END ONLY) + if (defined $minins){ + die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--minins $minins"; + } + + ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) + if (defined $maxins){ + die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); + push @bowtie_options,"--maxins $maxins"; + } + else{ + unless ($singles){ + push @bowtie_options,'--maxins 500'; + } + } + + ### QUIET prints nothing besides alignments (suppresses warnings) + if ($quiet){ + push @bowtie_options,'--quiet'; + } + + ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments + unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option + if (defined $chunk){ + push @bowtie_options,"--chunkmbs $chunk"; + } + else{ + push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) + } + } + + + ### SUMMARY OF ALL BOWTIE OPTIONS + my $bowtie_options = join (' ',@bowtie_options); + + + ### STRAND-SPECIFIC LIBRARIES + my $directional; + if ($non_directional){ + die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); + warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; + sleep (3); + $directional = 0; + } + elsif($pbat){ + die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); + die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2); + die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); + + warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; + sleep (3); + $directional = 0; + } + else{ + warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; + sleep (3); + $directional = 1; # default behaviour + } + + ### UNMAPPED SEQUENCE OUTPUT + $unmapped = 0 unless ($unmapped); + + ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT + $multi_map = 0 unless ($multi_map); + + + ### OUTPUT DIRECTORY + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + + if (chdir $output_dir){ + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; + warn "Created output directory $output_dir!\n\n"; + chdir $output_dir or die "Failed to move to $output_dir\n"; + $output_dir = getcwd; # making the path absolute + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + warn "Output will be written into the directory: $output_dir\n"; + } + else{ + $output_dir = ''; + } + + ### TEMPORARY DIRECTORY for C->T and G->A transcribed files + + chdir $parent_dir or die "Failed to move back to current working directory\n"; + if ($temp_dir){ + warn "\nUsing temp directory: $temp_dir\n"; + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + + if (chdir $temp_dir){ + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + else{ + mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; + warn "Created temporary directory $temp_dir!\n\n"; + chdir $temp_dir or die "Failed to move to $temp_dir\n"; + $temp_dir = getcwd; # making the path absolute + unless ($temp_dir =~ /\/$/){ + $temp_dir =~ s/$/\//; + } + } + warn "Temporary files will be written into the directory: $temp_dir\n"; + } + else{ + $temp_dir = ''; + } + + ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE + if ($non_bs_mm){ + if ($vanilla){ + die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; + } + } + + ### PREFIX FOR OUTPUT FILES + if ($prefix){ + # removing trailing dots + + $prefix =~ s/\.+$//; + + warn "Using the following prefix for output files: $prefix\n\n"; + sleep(1); + } + + + return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag); +} + + + +sub generate_SAM_header{ + print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order + foreach my $chr (keys %chromosomes){ + my $length = length ($chromosomes{$chr}); + print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length + } + print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version +} + +### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: +### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011) + +sub single_end_SAM_output{ + my ($id,$actual_seq,$methylation_call_params,$qual) = @_; + my $strand = $methylation_call_params->{$id}->{alignment_strand}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $start = $methylation_call_params->{$id}->{position}; + my $stop = $methylation_call_params->{$id}->{end_position}; + my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; + my $methcall = $methylation_call_params->{$id}->{methylation_call}; + my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + my $number_of_mismatches; + if ($bowtie2){ + $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; + } + else{ + $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; + } + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2**7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ##### + + my $flag; # FLAG variable used for SAM format. + if ($strand eq "+"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (OT) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + elsif ($strand eq "-"){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $flag = 16; # 16 for "-" strand (OB) + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) + } + else{ + die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; + } + } + else{ + die "Unexpected strand information: $strand\n\n"; + } + + ##### + + my $mapq = 255; # Assume mapping quality is unavailable + + ##### + + my $cigar; + if ($bowtie2){ + $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 + } + else{ + $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) + } + + ##### + + my $rnext = "*"; # Paired-end variable + + ##### + + my $pnext = 0; # Paired-end variable + + ##### + + my $tlen = 0; # Paired-end variable + + ##### + + if ($read_conversion eq 'CT'){ + $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands + } + else{ + $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries + } + + if ($strand eq '-'){ + $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand + $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence + $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well + } + + ##### + + my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string + # into the reference string. hemming_dist() + if ($bowtie2){ + $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + + my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) + + ##### + + my $XM_tag; # Optional tag XM: Methylation Call String + if ($strand eq '+'){ + $XM_tag = "XM:Z:$methcall"; + } + elsif ($strand eq '-'){ + $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well + } + + ##### + + my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches =~ s/-//; # removing the minus sign + + ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches + if ($cigar =~ /(D|I)/) { + # warn "$cigar\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches -= $insertion_open; + $number_of_mismatches -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches -= $deletion_open; + $number_of_mismatches -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; + } + } + # warn "Alignment score $number_of_mismatches\n"; + # print "Mismatches $number_of_mismatches\n\n"; + } + ### Now we have InDel corrected alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division + # warn "N count: $seq_N_count\n"; + $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; + # warn "MM $number_of_mismatches\n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches"; + + ##### + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; + } + else{ # default + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; + } +} + +sub paired_end_SAM_output{ + my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; + my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand + my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; + my $chr = $methylation_call_params->{$id}->{chromosome}; + my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; + my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; + my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; + my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; + my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; + my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; + my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; + + my $id_1; + my $id_2; + + if ($old_flag){ + $id_1 = $id.'/1'; + $id_2 = $id.'/2'; + } + else{ + $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard + $id_2 = $id; + } + + # Allows all degenerate nucleotide sequences in reference genome + die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; + die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; + + my $index; # used to store the srand origin of the alignment in a less convoluted way + + if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT (original top strand) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ + $index = 1; ## this is CTOB (complementary to OB) + } + elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ + $index = 2; ## this is CTOT (complementary to OT) + } + elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB (original bottom) + } + else { + die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; + } + + my $number_of_mismatches_1; + my $number_of_mismatches_2; + + if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine + $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; + } + else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation + if ($index == 2 or $index == 3){ # CTOT or OB + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; + } + else{ # if the first read aligned in forward direction it is like for Bowtie 2 + $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! + $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; + } + } + + + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the + ### first or last position. + + if ($index == 0 or $index == 3){ # OT or OB + $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); + } + else{ # CTOT or CTOB + $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); + $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); + } + + ##### + + my $start_read_1; + my $start_read_2; + # adjusting end positions + + if ($bowtie2){ + $start_read_1 = $methylation_call_params->{$id}->{position_1}; + $start_read_2 = $methylation_call_params->{$id}->{position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; + $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; + } + else{ # read 1 is on the - strand + $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; + $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; + } + } + + ##### + + my $end_read_1; + my $end_read_2; + # adjusting end positions + + if ($bowtie2){ + $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; + $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; + } + else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 + if ($strand_1 eq '+'){ # Read 1 aligns to the + strand + $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; + $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; + } + else{ + $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; + $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; + } + } + + ##### + + ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" + ## FLAG: bitwise FLAG. Each bit is explained in the following table: + ## Bit Description Comment Value + ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) + ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) + ## 0x4 segment unmapped --- --- + ## 0x8 next segment in the template unmapped --- --- + ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) + ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) + ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) + ## 0x80 the last segment in the template read 2 value: 2^^7 (128) + ## 0x100 secondary alignment --- --- + ## 0x200 not passing quality controls --- --- + ## 0x400 PCR or optical duplicate --- --- + + ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account + + # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) + # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) + + my $flag_1; # FLAG variable used for SAM format + my $flag_2; + + ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH) + + if ($index == 0){ # OT + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) + $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + } + } + elsif ($index == 1){ # CTOB + unless($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + } + } + elsif ($index == 2){ # CTOT + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) + $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + } + } + elsif ($index == 3){ # OB + unless ($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + } + } + + ##### + + my $mapq = 255; # Mapping quality is unavailable + + ##### + + my $cigar_1; + my $cigar_2; + + if ($bowtie2){ + $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 + $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; + } + else{ + $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) + $cigar_2 = length($actual_seq_2) . "M"; + } + + ##### + + my $rnext = '='; # Chromosome of mate; applies to both reads + + ##### + + my $pnext_1 = $start_read_2; # Leftmost position of mate + my $pnext_2 = $start_read_1; + + ##### + + my $tlen_1; # signed observed Template LENgth (or inferred fragment size) + my $tlen_2; + + if ($bowtie2){ + + if ($start_read_1 <= $start_read_2){ + + # Read 1 alignment is leftmost + + if ($end_read_2 >= $end_read_1){ + + # -------------------------> read 1 reads overlapping + # <------------------------- read 2 + # + # or + # + # -------------------------> read 1 + # <----------------------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 reads 1 and 2 exactly overlapping + # <------------------------- read 2 + # + + # dovetailing of reads is not enabled for Bowtie 2 alignments + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + elsif ($end_read_2 < $end_read_1){ + + # -------------------------> read 1 + # <----------- read 2 read 2 contained within read 1 + # + # or + # + # -------------------------> read 1 + # <------------------------ read 2 read 2 contained within read 1 + + # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable + $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, + $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + + } + + elsif ($start_read_2 < $start_read_1){ + + if ($end_read_1 >= $end_read_2){ + + # Read 2 alignment is leftmost + + # -------------------------> read 2 reads overlapping + # <------------------------- read 1 + # + # or + # + # -------------------------> read 2 + # <----------------------- read 1 read 1 contained within read 2 + # + # + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + elsif ($end_read_1 < $end_read_2){ + + # -------------------------> read 2 + # <----------- read 1 read 1 contained within read 2 + # + # or + # + # -------------------------> read 2 + # <------------------------ read 1 read 1 contained within read 2 + + # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable + $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, + $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } + } + } + + else{ # Bowtie 1 + + if ($end_read_2 >= $end_read_1){ + # Read 1 alignment is leftmost + # -------------------------> read 1 + # <------------------------- read 2 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, + $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign + } + else{ + # Read 2 alignment is leftmost + # -------------------------> read 2 + # <------------------------- read 1 + # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing + + $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, + $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign + } + } + + ##### + + # adjusting the strand of the sequence before we use them to generate mismatch strings + if ($strand_1 eq '-'){ + $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand + $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence + $qual_1 = reverse $qual_1; # we need to reverse the quality string as well + } + if ($strand_2 eq '-'){ + $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand + $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence + $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well + } + + # print "$actual_seq_1\n$ref_seq_1\n\n"; + # print "$actual_seq_2\n$ref_seq_2\n\n"; + + ##### + + my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence + my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); + if ($bowtie2){ + $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence + } + my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences + my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences + + ##### + + my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) + my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); + + ##### + + my $XM_tag_1; # Optional tag XM: Methylation call string + my $XM_tag_2; + + if ($strand_1 eq '-'){ + $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_1 = "XM:Z:$methcall_1"; + } + + if ($strand_2 eq '-'){ + $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented + } + else{ + $XM_tag_2 = "XM:Z:$methcall_2"; + } + + ##### + + my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state + my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state + + ##### + + my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads + + ##### + + # Optionally calculating number of mismatches for Bowtie 2 alignments + + if ($non_bs_mm) { + if ($bowtie2) { + + $number_of_mismatches_1 =~ s/-//; # removing the minus sign + $number_of_mismatches_2 =~ s/-//; + + ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches + + ### CIGAR 1 + if ($cigar_1 =~ /(D|I)/) { + # warn "$cigar_1\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_1); # storing the length per operation + my @ops = split (/\d+/,$cigar_1); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; # irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_1 -= $insertion_open; + $number_of_mismatches_1 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_1 -= $deletion_open; + $number_of_mismatches_1 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; + } + } + + # warn "Alignment score $number_of_mismatches_1\n"; + # print "Mismatches $number_of_mismatches_1\n\n"; + } + + ### CIGAR 2 + if ($cigar_2 =~ /(D|I)/) { + # warn "$cigar_2\n"; + + # parsing CIGAR string + my @len = split (/\D+/,$cigar_2); # storing the length per operation + my @ops = split (/\d+/,$cigar_2); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach (0..$#len) { + if ($ops[$_] eq 'M') { + # warn "skipping\n"; + next; #irrelevant + } + elsif ($ops[$_] eq 'I') { # insertion in the read sequence + $number_of_mismatches_2 -= $insertion_open; + $number_of_mismatches_2 -= $len[$_] * $insertion_extend; + # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; + } + elsif ($ops[$_] eq 'D') { # deletion in the read sequence + $number_of_mismatches_2 -= $deletion_open; + $number_of_mismatches_2 -= $len[$_] * $deletion_extend; + # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; + } + elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die + die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + else { + die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; + } + } + } + + ### Now we have InDel corrected Alignment scores + + ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the + ### sequence contained more than 5 Ns, but this should occur close to never + + my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division + my $seq_2_N_count = $number_of_mismatches_2 % 6; + # warn "N count 1: $seq_1_N_count\n"; + # warn "N count 2: $seq_2_N_count\n"; + + $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; + $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; + + # warn "MM1 $number_of_mismatches_1 \n"; + # warn "MM2 $number_of_mismatches_2 \n"; + } + } + + #### + + my $XA_tag = "XA:Z:$number_of_mismatches_1"; + my $XB_tag = "XB:Z:$number_of_mismatches_2"; + + + # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields + ### optionally print number of non-bisulfite mismatches + if ($non_bs_mm){ + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; + } + else{ # default + print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; + print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; + } +} + +sub revcomp{ + my $seq = shift or die "Missing seq to reverse complement\n"; + $seq = reverse $seq; + $seq =~ tr/ACTGactg/TGACTGAC/; + return $seq; +} + +sub hemming_dist{ + my $matches = 0; + my @actual_seq = split //,(shift @_); + my @ref_seq = split //,(shift @_); + foreach (0..$#actual_seq){ + ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); + } + return my $hd = scalar @actual_seq - $matches; +} + +sub make_mismatch_string{ + my $actual_seq = shift or die "Missing actual sequence"; + my $ref_seq = shift or die "Missing reference sequence"; + my $XX_tag = "XX:Z:"; + my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison + my $prev_mm_pos = 0; + while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference + my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch + my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read + $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other + $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) + $prev_mm_pos = pos($tmp); # Position of last mismatch + } + my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence + $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence + return $XX_tag; +} + + + +sub print_helpfile{ + print << "HOW_TO"; + + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + + +DESCRIPTION + + +The following is a brief description of command line options and arguments to control the Bismark +bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the +reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand +version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). +Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome +(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the +forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) +are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original +sequence from the genome and determine if there were any protected C's present or not. + +As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be +re-enabled by using --non_directional. + +The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old +'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best +alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. + + +USAGE: bismark [options] {-1 -2 | } + + +ARGUMENTS: + + The path to the folder containing the unmodified reference genome + as well as the subfolders created by the Bismark_Genome_Preparation + script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). + Bismark expects one or more fastA files in this folder (file extension: .fa + or .fasta). The path can be relative or absolute. + +-1 Comma-separated list of files containing the #1 mates (filename usually includes + "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. Bismark will produce one mapping result + and one report file per paired-end input file pair. + +-2 Comma-separated list of files containing the #2 mates (filename usually includes + "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must + correspond file-for-file and read-for-read with those specified in . + Reads may be a mix of different lengths. + + A comma- or space-separated list of files containing the reads to be aligned (e.g. + lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will + produce one mapping result and one report file per input file. + + +OPTIONS: + + +Input: + +-q/--fastq The query input files (specified as , or are FASTQ + files (usually having extension .fg or .fastq). This is the default. See also + --solexa-quals. + +-f/--fasta The query input files (specified as , or are FASTA + files (usually havin extension .fa, .mfa, .fna or similar). All quality values + are assumed to be 40 on the Phred scale. FASTA files are expected to contain both + the read name and the sequence on a single line (and not spread over several lines). + +-s/--skip Skip (i.e. do not align) the first reads or read pairs from the input. + +-u/--upto Only aligns the first reads or read pairs from the input. Default: no limit. + +--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. + +--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. + +--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled + (which can't). The formula for conversion is: + phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This + is usually the right option for use with (unconverted) reads emitted by the GA + Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. + +--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) + reads emitted by GA Pipeline version 1.3 or later. Default: off. + +--path_to_bowtie The full path to the Bowtie (1 or 2) installation on your system. If not + specified it is assumed that Bowtie (1 or 2) is in the PATH. + + +Alignment: + +-n/--seedmms The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs + of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the + default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). + +-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to + which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for + larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). + +-e/--maqerr Maximum permitted total of quality values at all mismatched read positions throughout + the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds + quality values to the nearest 10 and saturates at 30. This value is not relevant for + Bowtie 2. + +--chunkmbs The number of megabytes of memory a given thread is given to store path descriptors in + --best mode. Best-first search must keep track of many paths at once to ensure it is + always extending the path with the lowest cumulative cost. Bowtie tries to minimize the + memory impact of the descriptors, but they can still grow very large in some cases. If + you receive an error message saying that chunk memory has been exhausted in --best mode, + try adjusting this parameter up to dedicate more memory to the descriptors. This value + is not relevant for Bowtie 2. Default: 512. + +-I/--minins The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and + a paired-end alignment consists of two 20-bp alignments in the appropriate orientation + with a 20-bp gap between them, that alignment is considered valid (as long as -X is also + satisfied). A 19-bp gap would not be valid in that case. Default: 0. + +-X/--maxins The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and + a paired-end alignment consists of two 20-bp alignments in the proper orientation with a + 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). + A 61-bp gap would not be valid in that case. Default: 500. + + +Bowtie 1 Reporting: + +-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option + will be used by default. + +--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum + (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in + terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred + quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both + have Phred quality 10. When --best is not specified, Bowtie may report alignments that + are sub-optimal in terms of stratum and/or quality (though an effort is made to report + the best alignment). --best mode also removes all strand bias. Note that --best does not + affect which alignments are considered "valid" by Bowtie, only which valid alignments + are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. + Default: on. + +--no_best Disables the --best option which is on by default. This can speed up the alignment process, + e.g. for testing purposes, but for credible results it is not recommended to disable --best. + + +Output: + +--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four + bisulfite strands will be reported. Default: OFF. + + (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary + to the original strands are merely theoretical and should not exist in reality. Specifying directional + alignments (which is the default) will only run 2 alignment threads to the original top (OT) + or bottom (OB) strands in parallel and report these alignments. This is the recommended option + for sprand-specific libraries). + +--pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., + PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, + as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT + and OB ones. Use this option only if you are certain that your libraries were constructed following + a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option + --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed + temporary files only). + +--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are + split up into several smaller files to run concurrently and the output files are to be merged. + +--quiet Print nothing besides alignments. + +--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead + of SAM format output. + +-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will + appear as they did in the input, without any translation of quality values that may have + taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 + and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads + with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) + are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. + +--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest + mismatches or other reads that fail to align uniquely to a file in the output directory. + Written reads will appear as they did in the input, without any of the translation of quality + values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two + parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and + _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. + +-o/--output_dir Write all output files into this directory. By default the output files will be written into + the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt + to create it first. The path to the output folder can be either relative or absolute. + +--temp_dir Write temporary files to this directory instead of into the same directory as the input files. If + the specified folder does not exist, Bismark will attempt to create it first. The path to the + temporary folder can be either relative or absolute. + +--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the + alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is + just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. + The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' + and 'XB:Z:number of mismatches' for read 2 of paired-end reads. + +--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk + space. This option is available for most alignment modes but is not available for paired-end FastA + files. This option might be somewhat slower than writing out uncompressed files, but this awaits + further testing. + +--bam The output will be written out in BAM format instead of the default SAM format. Bismark will + attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't + been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, + the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--prefix Prefixes to the output filenames. Trailing dots will be replaced by a single one. For + example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. + +--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, + this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since + both the appended read IDs and custom FLAG values may cause problems with some downstream tools + such as Picard, new defaults were implemented as of version 0.8.3. + + + default old_flag + =================== =================== + Read 1 Read 2 Read 1 Read 2 + + OT: 99 147 67 131 + + OB: 83 163 115 179 + + CTOT: 99 147 67 131 + + CTOB: 83 163 115 179 + + + +Other: + +-h/--help Displays this help file. + +-v/--version Displays version information. + + +BOWTIE 2 SPECIFIC OPTIONS + +--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end + alignments, i.e. searches for alignments involving all read characters (also called + untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter + and/or quality trimmed where appropriate. Default: off. + +Bowtie 2 alignment options: + +-N Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. + Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) + but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for + Bowtie 1 see -n). + +-L Sets the length of the seed substrings to align during multiseed alignment. Smaller values + make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is + used by default, which sets -L to 20. This option is only available for Bowtie 2 (for + Bowtie 1 see -l). + +--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched + position to be the highest possible, regardless of the actual value. I.e. input is treated + as though all quality values are high. This is also the default behavior when the input + doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. + + +Bowtie 2 paired-end options: + +--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if + it cannot find a concordant or discordant alignment for a pair. This option is invariable and + and on by default. + +--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. + A discordant alignment is an alignment where both mates align uniquely, but that does not + satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior + and it is on by default. + + +Bowtie 2 effort options: + +-D Up to consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using + the alignments found so far. A seed extension "fails" if it does not yield a new best or a + new second-best alignment. Default: 15. + +-R is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. + When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of + mismatches allowed) at different offsets and searches for more alignments. A read is considered + to have repetitive seeds if the total number of seed hits divided by the number of seeds + that aligned at least once is greater than 300. Default: 2. + +Bowtie 2 parallelization options: + + +-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores + and synchronize when parsing reads and outputting alignments. Searching for alignments is highly + parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. + E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint + by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads + library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will + automatically use the option '--reorder', which guarantees that output SAM records are printed in + an order corresponding to the order of the reads in the original input file, even when -p is set + greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and + setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then + if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally + correspond to input order in that case. + +Bowtie 2 Scoring options: + +--score_min Sets a function governing the minimum alignment score needed for an alignment to be considered + "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying + L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. + See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is + L,0,-0.2. + +--rdg , Sets the read gap open () and extend () penalties. A read gap of length N gets a penalty + of + N * . Default: 5, 3. + +--rfg , Sets the reference gap open () and extend () penalties. A reference gap of length N gets + a penalty of + N * . Default: 5, 3. + + +Bowtie 2 Reporting options: + +-most_valid_alignments This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is + deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the + default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the + effort expended to find valid alignments. + + For reference, this used to be the old (now deprecated) description of -M: + Bowtie 2 searches for at most +1 distinct, valid alignments for each read. The search terminates when it + can't find more distinct valid alignments, or when it finds +1 distinct alignments, whichever + happens first. Only the best alignment is reported. Information from the other alignments is used to + estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes + Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that + aligns many places. For reads that have more than +1 distinct, valid alignments, Bowtie 2 does not + guarantee that the alignment reported is the best possible in terms of alignment score. -M is + always used and its default value is set to 10. + + +'VANILLA' Bismark OUTPUT: + +Single-end output format (tab-separated): + + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(11) + + +Paired-end output format (tab-separated): + (1) + (2) + (3) + (4) + (5) + (6) + (7) + (8) + (9) +(10) +(11) +(12) +(14) +(15) + + +Bismark SAM OUTPUT (default): + + (1) QNAME (seq-ID) + (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) + (3) RNAME (chromosome) + (4) POS (start position) + (5) MAPQ (always 255) + (6) CIGAR + (7) RNEXT + (8) PNEXT + (9) TLEN +(10) SEQ +(11) QUAL (Phred33 scale) +(12) NM-tag (edit distance to the reference) +(13) XX-tag (base-by-base mismatches to the reference. This does not include indels) +(14) XM-tag (methylation call string) +(15) XR-tag (read conversion state for the alignment) +(16) XG-tag (genome conversion state for the alignment) +(17) XA/XB-tag (non-bisulfite mismatches) (optional!) + +Each read of paired-end alignments is written out in a separate line in the above format. + + +Last edited on 07 October 2013. + +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 old/bismark_genome_preparation --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/old/bismark_genome_preparation Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,459 @@ +#!/usr/bin/perl -- +use strict; +use warnings; +use Cwd; +use File::Path qw(rmtree); +$|++; + + +## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +use Getopt::Long; +use Cwd; + +my $verbose; +my $help; +my $version; +my $man; +my $path_to_bowtie; +my $multi_fasta; +my $single_fasta; +my $bowtie2; + +my $bismark_version = 'v0.10.0'; + +GetOptions ('verbose' => \$verbose, + 'help' => \$help, + 'man' => \$man, + 'version' => \$version, + 'path_to_bowtie:s' => \$path_to_bowtie, + 'single_fasta' => \$single_fasta, + 'bowtie2' => \$bowtie2, + ); + +if ($help or $man){ + print_helpfile(); + exit; +} + +if ($version){ + print << "VERSION"; + + Bismark - Bisulfite Mapper and Methylation Caller. + + Bismark Genome Preparation Version: $bismark_version + Copyright 2010-13 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/ + +VERSION + exit; +} + +my $genome_folder = shift @ARGV; # mandatory + +# Ensuring a genome folder has been specified +if ($genome_folder){ + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } + $verbose and print "Path to genome folder specified as: $genome_folder\n"; + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + # making the genome folder path abolsolute so it won't break if the path was specified relative + $genome_folder = getcwd; + unless ($genome_folder =~ /\/$/){ + $genome_folder =~ s/$/\//; + } +} +else{ + die "Please specify a genome folder to be used for bisulfite conversion\n\n"; +} + + +my $CT_dir; +my $GA_dir; + + +if ($single_fasta){ + print "Writing individual genomes out into single-entry fasta files (one per chromosome)\n\n"; + $multi_fasta = 0; +} +else{ + print "Writing bisulfite genomes out into a single MFA (multi FastA) file\n\n"; + $single_fasta = 0; + $multi_fasta = 1; +} + +my @filenames = create_bisulfite_genome_folders(); + +process_sequence_files (); + +launch_bowtie_indexer(); + +sub launch_bowtie_indexer{ + if ($bowtie2){ + print "Bismark Genome Preparation - Step III: Launching the Bowtie 2 indexer\n"; + } + else{ + print "Bismark Genome Preparation - Step III: Launching the Bowtie (1) indexer\n"; + } + print "Please be aware that this process can - depending on genome size - take up to several hours!\n"; + sleep(5); + + ### if the path to bowtie was specfified explicitely + if ($path_to_bowtie){ + if ($bowtie2){ + $path_to_bowtie =~ s/$/bowtie2-build/; + } + else{ + $path_to_bowtie =~ s/$/bowtie-build/; + } + } + ### otherwise we assume that bowtie-build is in the path + else{ + if ($bowtie2){ + $path_to_bowtie = 'bowtie2-build'; + } + else{ + $path_to_bowtie = 'bowtie-build'; + } + } + + $verbose and print "\n"; + + ### Forking the program to run 2 instances of Bowtie-build or Bowtie2-build (= the Bowtie (1/2) indexer) + my $pid = fork(); + + # parent process + if ($pid){ + sleep(1); + chdir $CT_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of CT converted genome in $CT_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Parent process: Starting to index C->T converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_CT\n\n"; + + sleep (11); + exec ("$path_to_bowtie","-f","$file_list","BS_CT"); + } + + # child process + elsif ($pid == 0){ + sleep(2); + chdir $GA_dir or die "Unable to change directory: $!\n"; + $verbose and warn "Preparing indexing of GA converted genome in $GA_dir\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "Child process: Starting to index G->A converted genome with the following command:\n\n"; + $verbose and print "$path_to_bowtie -f $file_list BS_GA\n\n"; + $verbose and print "(starting in 10 seconds)\n"; + sleep(10); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } + + # if the platform doesn't support the fork command we will run the indexing processes one after the other + else{ + print "Forking process was not successful, therefore performing the indexing sequentially instead\n"; + sleep(10); + + ### moving to CT genome folder + $verbose and warn "Preparing to index CT converted genome in $CT_dir\n"; + chdir $CT_dir or die "Unable to change directory: $!\n"; + my @fasta_files = <*.fa>; + my $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + sleep(2); + system ("$path_to_bowtie","-f","$file_list","BS_CT"); + @fasta_files=(); + $file_list= ''; + + ### moving to GA genome folder + $verbose and warn "Preparing to index GA converted genome in $GA_dir\n"; + chdir $GA_dir or die "Unable to change directory: $!\n"; + @fasta_files = <*.fa>; + $file_list = join (',',@fasta_files); + $verbose and print "$file_list\n\n"; + sleep(2); + exec ("$path_to_bowtie","-f","$file_list","BS_GA"); + } +} + + +sub process_sequence_files { + + my ($total_CT_conversions,$total_GA_conversions) = (0,0); + $verbose and print "Bismark Genome Preparation - Step II: Bisulfite converting reference genome\n\n"; + sleep (3); + + $verbose and print "conversions performed:\n"; + $verbose and print join("\t",'chromosome','C->T','G->A'),"\n"; + + + ### If someone wants to index a genome which consists of thousands of contig and scaffold files we need to write the genome conversions into an MFA file + ### Otherwise the list of comma separated chromosomes we provide for bowtie-build will get too long for the kernel to handle + ### This is now the default option + + if ($multi_fasta){ + ### Here we just use one multi FastA file name, append .CT_conversion or .GA_conversion and print all sequence conversions into these files + my $bisulfite_CT_conversion_filename = "$CT_dir/genome_mfa.CT_conversion.fa"; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/genome_mfa.GA_conversion.fa"; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + foreach my $filename(@filenames){ + my ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + open (IN,$filename) or die "Failed to read from sequence file $filename $!\n"; + # warn "Reading chromosome information from $filename\n\n"; + + ### first line needs to be a fastA header + my $first_line = ; + chomp $first_line; + + ### Extracting chromosome name from the FastA header + my $chromosome_name = extract_chromosome_name($first_line); + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; # first entry + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; # first entry + + + while (){ + + ### in case the line is a new fastA header + if ($_ =~ /^>/){ + ### printing out the stats for the previous chromosome + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + ### resetting the chromosome transliteration counters + ($chromosome_CT_conversions,$chromosome_GA_conversions) = (0,0); + + ### Extracting chromosome name from the additional FastA header + $chromosome_name = extract_chromosome_name($_); + + ### alternatively, chromosomes can be written out into single-entry FastA files. This will only work for genomes with up to a few hundred chromosomes. + unless ($multi_fasta){ + my $bisulfite_CT_conversion_filename = "$CT_dir/$chromosome_name"; + $bisulfite_CT_conversion_filename =~ s/$/.CT_conversion.fa/; + open (CT_CONVERT,'>',$bisulfite_CT_conversion_filename) or die "Can't write to file $bisulfite_CT_conversion_filename: $!\n"; + + my $bisulfite_GA_conversion_filename = "$GA_dir/$chromosome_name"; + $bisulfite_GA_conversion_filename =~ s/$/.GA_conversion.fa/; + open (GA_CONVERT,'>',$bisulfite_GA_conversion_filename) or die "Can't write to file $bisulfite_GA_conversion_filename: $!\n"; + } + + print CT_CONVERT ">",$chromosome_name,"_CT_converted\n"; + print GA_CONVERT ">",$chromosome_name,"_GA_converted\n"; + } + + else{ + my $sequence = uc$_; + + ### (I) First replacing all ambiguous sequence characters (such as M,S,R....) by N (G,A,T,C,N and the line endings \r and \n are added to a character group) + + $sequence =~ s/[^ATCGN\n\r]/N/g; + + ### (II) Writing the chromosome out into a C->T converted version (equals forward strand conversion) + + my $CT_sequence = $sequence; + my $CT_transliterations_performed = ($CT_sequence =~ tr/C/T/); # converts all Cs into Ts + $total_CT_conversions += $CT_transliterations_performed; + $chromosome_CT_conversions += $CT_transliterations_performed; + + print CT_CONVERT $CT_sequence; + + ### (III) Writing the chromosome out in a G->A converted version of the forward strand (this is equivalent to reverse- + ### complementing the forward strand and then C->T converting it) + + my $GA_sequence = $sequence; + my $GA_transliterations_performed = ($GA_sequence =~ tr/G/A/); # converts all Gs to As on the forward strand + $total_GA_conversions += $GA_transliterations_performed; + $chromosome_GA_conversions += $GA_transliterations_performed; + + print GA_CONVERT $GA_sequence; + + } + } + $verbose and print join ("\t",$chromosome_name,$chromosome_CT_conversions,$chromosome_GA_conversions),"\n"; + } + close (CT_CONVERT) or die "Failed to close filehandle: $!\n"; + close (GA_CONVERT) or die "Failed to close filehandle: $!\n"; + + + print "\nTotal number of conversions performed:\n"; + print "C->T:\t$total_CT_conversions\n"; + print "G->A:\t$total_GA_conversions\n"; + + warn "\nStep II - Genome bisulfite conversions - completed\n\n\n"; +} + +sub extract_chromosome_name { + + my $header = shift; + + ## Bowtie extracts the first string after the initial > in the FASTA file, so we are doing this as well + + if ($header =~ s/^>//){ + my ($chromosome_name) = split (/\s+/,$header); + return $chromosome_name; + } + else{ + die "The specified chromosome file doesn't seem to be in FASTA format as required! $!\n"; + } +} + +sub create_bisulfite_genome_folders{ + + $verbose and print "Bismark Genome Preparation - Step I: Preparing folders\n\n"; + + if ($path_to_bowtie){ + unless ($path_to_bowtie =~ /\/$/){ + $path_to_bowtie =~ s/$/\//; + } + if (chdir $path_to_bowtie){ + if ($bowtie2){ + $verbose and print "Path to Bowtie 2 specified: $path_to_bowtie\n"; + } + else{ + $verbose and print "Path to Bowtie (1) specified: $path_to_bowtie\n"; + } + } + else{ + die "There was an error with the path to bowtie: $!\n"; + } + } + + chdir $genome_folder or die "Could't move to directory $genome_folder. Make sure the directory exists! $!"; + + + # Exiting unless there are fastA files in the folder + my @filenames = <*.fa>; + + ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta + unless (@filenames){ + @filenames = <*.fasta>; + } + + unless (@filenames){ + die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions\n"; + } + + warn "Bisulfite Genome Indexer version $bismark_version (last modified 19 Sept 2013)\n\n"; + sleep (3); + + # creating a directory inside the genome folder to store the bisfulfite genomes unless it already exists + my $bisulfite_dir = "${genome_folder}Bisulfite_Genome/"; + unless (-d $bisulfite_dir){ + mkdir $bisulfite_dir or die "Unable to create directory $bisulfite_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $bisulfite_dir\n"; + } + else{ + print "\nA directory called $bisulfite_dir already exists. Bisulfite converted sequences and/or already existing Bowtie (1 or 2) indices will be overwritten!\n\n"; + sleep(5); + } + + chdir $bisulfite_dir or die "Unable to move to $bisulfite_dir\n"; + $CT_dir = "${bisulfite_dir}CT_conversion/"; + $GA_dir = "${bisulfite_dir}GA_conversion/"; + + # creating 2 subdirectories to store a C->T (forward strand conversion) and a G->A (reverse strand conversion) + # converted version of the genome + unless (-d $CT_dir){ + mkdir $CT_dir or die "Unable to create directory $CT_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $CT_dir\n"; + } + unless (-d $GA_dir){ + mkdir $GA_dir or die "Unable to create directory $GA_dir $!\n"; + $verbose and print "Created Bisulfite Genome folder $GA_dir\n"; + } + + # moving back to the original genome folder + chdir $genome_folder or die "Could't move to directory $genome_folder $!"; + # $verbose and print "Moved back to genome folder folder $genome_folder\n"; + warn "\nStep I - Prepare genome folders - completed\n\n\n"; + return @filenames; +} + +sub print_helpfile{ + print << 'HOW_TO'; + + +DESCRIPTION + +This script is supposed to convert a specified reference genome into two different bisulfite +converted versions and index them for alignments with Bowtie 1 (default), or Bowtie 2. The first +bisulfite genome will have all Cs converted to Ts (C->T), and the other one will have all Gs +converted to As (G->A). Both bisulfite genomes will be stored in subfolders within the reference +genome folder. Once the bisulfite conversion has been completed the program will fork and launch +two simultaneous instances of the Bowtie 1 or 2 indexer (bowtie-build or bowtie2-build). Be aware +that the indexing process can take up to several hours; this will mainly depend on genome size +and system resources. + + + +The following is a brief description of command line options and arguments to control the +Bismark Genome Preparation: + + +USAGE: bismark_genome_preparation [options] + + +OPTIONS: + +--help/--man Displays this help filea and exits. + +--version Displays version information and exits. + +--verbose Print verbose output for more details or debugging. + +--path_to_bowtie The full path to the Bowtie 1 or Bowtie 2 installation on your system + (depending on which aligner/indexer you intend to use). Unless this path + is specified it is assumed that Bowtie is in the PATH. + +--bowtie2 This will create bisulfite indexes for Bowtie 2. (Default: Bowtie 1). + +--single_fasta Instruct the Bismark Indexer to write the converted genomes into + single-entry FastA files instead of making one multi-FastA file (MFA) + per chromosome. This might be useful if individual bisulfite converted + chromosomes are needed (e.g. for debugging), however it can cause a + problem with indexing if the number of chromosomes is vast (this is likely + to be in the range of several thousand files; the operating system can + only handle lists up to a certain length, and some newly assembled + genomes may contain 20000-50000 contigs of scaffold files which do exceed + this list length limit). + + +ARGUMENTS: + + The path to the folder containing the genome to be bisulfite converted. + The Bismark Genome Preparation expects one or more fastA files in the folder + (with the file extension: .fa or .fasta). Specifying this path is mandatory. + + +This script was last modified on 19 Sept 2013. +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 old/bismark_methylation_extractor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/old/bismark_methylation_extractor Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,4760 @@ +#!/usr/bin/perl +use warnings; +use strict; +$|++; +use Getopt::Long; +use Cwd; +use Carp; +use FindBin qw($Bin); +use lib "$Bin/../lib"; + + +## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) + +## This program is free software: you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation, either version 3 of the License, or +## (at your option) any later version. + +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. + +## You should have received a copy of the GNU General Public License +## along with this program. If not, see . + +my @filenames; # input files +my %counting; +my $parent_dir = getcwd(); + +my %fhs; + +my $version = 'v0.10.1'; +my ($ignore,$genomic_fasta,$single,$paired,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem) = process_commandline(); + + +### only needed for bedGraph output +my @sorting_files; # if files are to be written to bedGraph format, these are the methylation extractor output files +my @methylcalls = qw (0 0 0); # [0] = methylated, [1] = unmethylated, [2] = total +my @bedfiles; + +### only needed for genome-wide cytosine methylation report +my %chromosomes; + +my %mbias_1; +my %mbias_2; + +############################################################################################## +### Summarising Run Parameters +############################################################################################## + +### METHYLATION EXTRACTOR + +warn "Summarising Bismark methylation extractor parameters:\n"; +warn '='x63,"\n"; + +if ($single){ + if ($vanilla){ + warn "Bismark single-end vanilla format specified\n"; + } + else{ + warn "Bismark single-end SAM format specified (default)\n"; # default + } +} +elsif ($paired){ + if ($vanilla){ + warn "Bismark paired-end vanilla format specified\n"; + } + else{ + warn "Bismark paired-end SAM format specified (default)\n"; # default + } +} + +if ($single){ + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string\n"; + } +} +else{ ## paired-end + if ($ignore){ + warn "First $ignore bp will be disregarded when processing the methylation call string of Read 1\n"; + } + if ($ignore_r2){ + warn "First $ignore_r2 bp will be disregarded when processing the methylation call string of Read 2\n"; + } +} + + +if ($full){ + warn "Strand-specific outputs will be skipped. Separate output files for cytosines in CpG, CHG and CHH context will be generated\n"; +} +if ($merge_non_CpG){ + warn "Merge CHG and CHH context to non-CpG context specified\n"; +} +### output directory +if ($output_dir eq ''){ + warn "Output will be written to the current directory ('$parent_dir')\n"; +} +else{ + warn "Output path specified as: $output_dir\n"; +} + + +sleep (1); + +### BEDGRAPH + +if ($bedGraph){ + warn "\n\nSummarising bedGraph parameters:\n"; + warn '='x63,"\n"; + + if ($counts){ + warn "Generating additional output in bedGraph and coverage format\nbedGraph format:\t \ncoverage format:\t \n\n"; + } + else{ + warn "Generating additional sorted output in bedGraph format (output format: )\n"; + } + + warn "Using a cutoff of $coverage_threshold read(s) to report cytosine positions\n"; + + if ($CX_context){ + warn "Reporting and sorting methylation information for all cytosine context (sorting may take a long time, you have been warned ...)\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting and sorting cytosine methylation information in CpG context only (default)\n"; + } + + if ($remove){ + warn "White spaces in read ID names will be removed prior to sorting\n"; + } + + if ($ample_mem){ + warn "Sorting chromosomal postions for the bedGraph step using arrays instead of using UNIX sort\n"; + } + elsif (defined $sort_size){ + warn "The bedGraph UNIX sort command will use the following memory setting:\t'$sort_size'. Temporary directory used for sorting is the output directory\n"; + } + else{ + warn "Setting a default memory usage for the bedGraph UNIX sort command to 2GB\n"; + } + + + + sleep (1); + + if ($cytosine_report){ + warn "\n\nSummarising genome-wide cytosine methylation report parameters:\n"; + warn '='x63,"\n"; + warn "Generating comprehensive genome-wide cytosine report\n(output format: )\n"; + + + if ($CX_context){ + warn "Reporting methylation for all cytosine contexts. Be aware that this will generate enormous files\n"; + } + else{ # default + $CpG_only = 1; + warn "Reporting cytosine methylation in CpG context only (default)\n"; + } + + if ($split_by_chromosome){ + warn "Splitting the cytosine report output up into individual files for each chromosome\n"; + } + + ### Zero-based coordinates + if ($zero){ + warn "Using zero-based genomic coordinates (user-defined)\n"; + } + else{ # default, 1-based coords + warn "Using 1-based genomic coordinates (default)\n"; + } + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + warn "Genome folder was specified as $genome_folder\n"; + } + else{ + $genome_folder = '/data/public/Genomes/Mouse/NCBIM37/'; + warn "Using the default genome folder /data/public/Genomes/Mouse/NCBIM37/\n"; + } + sleep (1); + } +} + +warn "\n"; +sleep (5); + +###################################################### +### PROCESSING FILES +###################################################### + +foreach my $filename (@filenames){ + # resetting counters and filehandles + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + ); + + @sorting_files = (); + @bedfiles = (); + + %mbias_1 = (); + %mbias_2 = (); + + ### performing a quick check to see if a paired-end SAM file has been sorted by positions which does interfere with the logic used by the extractor + unless ($vanilla){ + if ($paired){ + test_positional_sorting($filename); + } + } + + process_Bismark_results_file($filename); + + ### Closing all filehandles so that the Bismark methylation extractor output doesn't get truncated due to buffering issues + foreach my $fh (keys %fhs) { + if ($fh =~ /^[1230]$/) { + foreach my $context (keys %{$fhs{$fh}}) { + close $fhs{$fh}->{$context} or die $!; + } + } + else{ + close $fhs{$fh} or die $!; + } + } + + ### printing out all M-Bias data + produce_mbias_plots ($filename); + + delete_unused_files(); + + if ($bedGraph){ + + my $out = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $out =~ s/gz$//; + $out =~ s/sam$//; + $out =~ s/bam$//; + $out =~ s/txt$//; + $out =~ s/$/bedGraph/; + + my $bedGraph_output = $out; + my @args; + + if ($remove){ + push @args, '--remove'; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($no_header){ + push @args, '--no_header'; + } + if ($gazillion){ + push @args, '--gazillion'; + } + if ($ample_mem){ + push @args, '--ample_memory'; + } + + + # if ($counts){ + # push @args, "--counts"; + # } + + push @args, "--buffer_size $sort_size"; + push @args, "--cutoff $coverage_threshold"; + push @args, "--output $bedGraph_output"; + push @args, "--dir '$output_dir'"; + + ### adding all files to be sorted to @args + foreach my $f (@sorting_files){ + push @args, $f; + } + + # print join "\t",@args,"\n"; + + system ("$Bin/bismark2bedGraph @args"); + + warn "Finished BedGraph conversion ...\n\n"; + sleep(3); + + # open (OUT,'>',$output_dir.$bedGraph_output) or die "Problems with the bedGraph output filename detected: file path: '$output_dir'\tfile name: '$bedGraph_output' $!"; + # warn "Writing bedGraph to file: $bedGraph_output\n"; + # process_bedGraph_output(); + # close OUT or die $!; + + ### genome-wide cytosine methylation report requires bedGraph processing anyway + if ($cytosine_report){ + + @args = (); # resetting @args + my $cytosine_out = $out; + $cytosine_out =~ s/bedGraph$//; + + if ($CX_context){ + $cytosine_out =~ s/$/CX_report.txt/; + } + else{ + $cytosine_out =~ s/$/CpG_report.txt/; + } + + push @args, "--output $cytosine_out"; + push @args, "--dir '$output_dir'"; + push @args, "--genome '$genome_folder'"; + push @args, "--parent_dir '$parent_dir'"; + + if ($zero){ + push @args, "--zero"; + } + if ($CX_context){ + push @args, '--CX_context'; + } + if ($split_by_chromosome){ + push @args, '--split_by_chromosome'; + } + + my $coverage_output = $bedGraph_output; + $coverage_output =~ s/bedGraph$/bismark.cov/; + + push @args, $output_dir . $coverage_output; # this will be the infile + + system ("$Bin/coverage2cytosine @args"); + # generate_genome_wide_cytosine_report($bedGraph_output,$cytosine_out); + warn "\n\nFinished generating genome-wide cytosine report\n\n"; + } + } +} + +sub delete_unused_files{ + + warn "Deleting unused files ...\n\n"; sleep(1); + + my $index = 0; + + while ($index <= $#sorting_files){ + if ($sorting_files[$index] =~ /gz$/){ + open (USED,"zcat $sorting_files[$index] |") or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + else{ + open (USED,$sorting_files[$index]) or die "Failed to read from methylation extractor output file $sorting_files[$index]: $!\n"; + } + + my $used = 0; + + while (){ + next if (/^Bismark/); + if ($_){ + $used = 1; + last; + } + } + + if ($used){ + warn "$sorting_files[$index] contains data ->\tkept\n"; + ++$index; + } + else{ + + my $delete = unlink $sorting_files[$index]; + + if ($delete){ + warn "$sorting_files[$index] was empty ->\tdeleted\n"; + } + else{ + warn "$sorting_files[$index] was empty, however deletion was unsuccessful: $!\n" + } + + ### we also need to remove the element from @sorting_files + splice @sorting_files, $index, 1; + } + } + warn "\n\n"; ## can't close the piped filehandles at this point because it will die (unfortunately) +} + +sub produce_mbias_plots{ + + my $filename = shift; + + my $mbias = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $mbias =~ s/gz$//; + $mbias =~ s/sam$//; + $mbias =~ s/bam$//; + $mbias =~ s/txt$//; + my $mbias_graph_1 = my $mbias_graph_2 = $mbias; + $mbias_graph_1 = $output_dir . $mbias_graph_1 . 'M-bias_R1.png'; + $mbias_graph_2 = $output_dir . $mbias_graph_2 . 'M-bias_R2.png'; + + $mbias =~ s/$/M-bias.txt/; + + open (MBIAS,'>',"$output_dir$mbias") or die "Failed to open file for the M-bias data\n\n"; + + # determining maximum read length + my $max_length_1 = 0; + my $max_length_2 = 0; + + foreach my $context (keys %mbias_1){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_1{$context}}){ + $max_length_1 = $pos unless ($max_length_1 >= $pos); + } + } + if ($paired){ + foreach my $context (keys %mbias_2){ + foreach my $pos (sort {$a<=>$b} keys %{$mbias_2{$context}}){ + $max_length_2 = $pos unless ($max_length_2 >= $pos); + } + } + } + + if ($single){ + warn "Determining maximum read length for M-Bias plot\n"; + warn "Maximum read length of Read 1: $max_length_1\n\n"; + } + else{ + warn "Determining maximum read lengths for M-Bias plots\n"; + warn "Maximum read length of Read 1: $max_length_1\n"; + warn "Maximum read length of Read 2: $max_length_2\n\n"; + } + # sleep(3); + + my @mbias_read1; + my @mbias_read2; + + #Check whether the module GD::Graph:lines is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::lines; + GD::Graph::lines->import(); + }; + + unless($@) { # syntax or routine error variable, set if something goes wron in the last eval{ require ...} + $gd_graph_installed = 1; + + #Check whether the module GD::Graph::colour is installed + eval{ + require GD::Graph::colour; + GD::Graph::colour->import(qw(:colours :lists :files :convert)); + }; + + if ($@) { + warn "Perl module GD::Graph::colour not found, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + $gd_graph_installed = 0; + } + + + } + else{ + warn "Perl module GD::Graph::lines is not installed, skipping drawing M-bias plots (only writing out M-bias plot table)\n"; + sleep(2); + } + + + my $graph_title; + my $graph1; + my $graph2; + + if ( $gd_graph_installed){ + $graph1 = GD::Graph::lines->new(800,600); + if ($paired){ + $graph2 = GD::Graph::lines->new(800,600); + } + } + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read1[0]} = (); + + if ($paired){ + print MBIAS "$context context (R1)\n================\n"; + $graph_title = 'M-bias (Read 1)'; + } + else{ + print MBIAS "$context context\n===========\n"; + $graph_title = 'M-bias'; + } + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + + foreach my $pos (1..$max_length_1){ + + unless (defined $mbias_1{$context}->{$pos}->{meth}){ + $mbias_1{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_1{$context}->{$pos}->{un}){ + $mbias_1{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_1{$context}->{$pos}->{meth} * 100/ ( $mbias_1{$context}->{$pos}->{meth} + $mbias_1{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_1{$context}->{$pos}->{un} + $mbias_1{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_1{$context}->{$pos}->{meth}\t$mbias_1{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + push @{$mbias_read1[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read1[1]},$percent; + push @{$mbias_read1[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read1[2]},$percent; + push @{$mbias_read1[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read1[3]},$percent; + push @{$mbias_read1[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph1->set( + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# methylation calls', + title => $graph_title, + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + ) or die $graph1->error; + + $graph1->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + + my $gd1 = $graph1->plot(\@mbias_read1) or die $graph1->error; + + open (MBIAS_G1,'>',$mbias_graph_1) or die "Failed to write to file for M-bias plot 1: $!\n\n"; + binmode MBIAS_G1; + print MBIAS_G1 $gd1->png; + } + + if ($paired){ + + foreach my $context (qw(CpG CHG CHH)){ + @{$mbias_read2[0]} = (); + + print MBIAS "$context context (R2)\n================\n"; + print MBIAS "position\tcount methylated\tcount unmethylated\t% methylation\tcoverage\n"; + foreach my $pos (1..$max_length_2){ + + unless (defined $mbias_2{$context}->{$pos}->{meth}){ + $mbias_2{$context}->{$pos}->{meth} = 0; + } + unless (defined $mbias_2{$context}->{$pos}->{un}){ + $mbias_2{$context}->{$pos}->{un} = 0; + } + + my $percent = ''; + if (($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) > 0){ + $percent = sprintf("%.2f",$mbias_2{$context}->{$pos}->{meth} * 100/ ($mbias_2{$context}->{$pos}->{meth} + $mbias_2{$context}->{$pos}->{un}) ); + } + my $coverage = $mbias_2{$context}->{$pos}->{un} + $mbias_2{$context}->{$pos}->{meth}; + + print MBIAS "$pos\t$mbias_2{$context}->{$pos}->{meth}\t$mbias_2{$context}->{$pos}->{un}\t$percent\t$coverage\n"; + + push @{$mbias_read2[0]},$pos; + + if ($context eq 'CpG'){ + push @{$mbias_read2[1]},$percent; + push @{$mbias_read2[4]},$coverage; + } + elsif ($context eq 'CHG'){ + push @{$mbias_read2[2]},$percent; + push @{$mbias_read2[5]},$coverage; + } + elsif ($context eq 'CHH'){ + push @{$mbias_read2[3]},$percent; + push @{$mbias_read2[6]},$coverage; + } + } + print MBIAS "\n"; + } + + if ( $gd_graph_installed){ + + add_colour(nice_blue => [31,120,180]); + add_colour(nice_orange => [255,127,0]); + add_colour(nice_green => [51,160,44]); + add_colour(pale_blue => [153,206,227]); + add_colour(pale_orange => [253,204,138]); + add_colour(pale_green => [191,230,207]); + + $graph2->set( + x_label => 'position (bp)', + line_width => 2, + x_max_value => $max_length_1, + x_min_value => 0, + y_tick_number => 10, + y_label_skip => 2, + y1_max_value => 100, + y1_min_value => 0, + y_label_skip => 2, + y2_min_value => 0, + x_label_skip => 5, + x_label_position => 0.5, + x_tick_offset => -1, + bgclr => 'white', + transparent => 0, + two_axes => 1, + use_axis => [1,1,1,2,2,2], + legend_placement => 'RC', + legend_spacing => 6, + legend_marker_width => 24, + legend_marker_height => 18, + dclrs => [ qw(nice_blue nice_orange nice_green pale_blue pale_orange pale_green)], + x_label => 'position (bp)', + y1_label => '% methylation', + y2_label => '# calls', + title => 'M-bias (Read 2)', + ) or die $graph2->error; + + $graph2->set_legend('CpG methylation','CHG methylation','CHH methylation','CpG total calls','CHG total calls','CHH total calls'); + my $gd2 = $graph2->plot(\@mbias_read2) or die $graph2->error; + + open (MBIAS_G2,'>',$mbias_graph_2) or die "Failed to write to file for M-bias plot 2: $!\n\n"; + binmode MBIAS_G2; + print MBIAS_G2 $gd2->png; + + } + } +} + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $ignore_r2; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_dir; + my $no_header; + my $bedGraph; + my $coverage_threshold = 1; # Minimum number of reads covering before calling methylation status + my $remove; + my $counts; + my $cytosine_report; + my $genome_folder; + my $zero; + my $CpG_only; + my $CX_context; + my $split_by_chromosome; + my $sort_size; + my $samtools_path; + my $gzip; + my $mbias_only; + my $gazillion; + my $ample_mem; + + my $command_line = GetOptions ('help|man' => \$help, + 'p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'fasta' => \$genomic_fasta, + 'ignore=i' => \$ignore, + 'ignore_r2=i' => \$ignore_r2, + 'comprehensive' => \$full, + 'report' => \$report, + 'version' => \$extractor_version, + 'no_overlap' => \$no_overlap, + 'merge_non_CpG' => \$merge_non_CpG, + 'vanilla' => \$vanilla, + 'o|output=s' => \$output_dir, + 'no_header' => \$no_header, + 'bedGraph' => \$bedGraph, + "cutoff=i" => \$coverage_threshold, + "remove_spaces" => \$remove, + "counts" => \$counts, + "cytosine_report" => \$cytosine_report, + 'g|genome_folder=s' => \$genome_folder, + "zero_based" => \$zero, + "CX|CX_context" => \$CX_context, + "split_by_chromosome" => \$split_by_chromosome, + "buffer_size=s" => \$sort_size, + 'samtools_path=s' => \$samtools_path, + "gzip" => \$gzip, + "mbias_only" => \$mbias_only, + "gazillion|scaffolds" => \$gazillion, + "ample_memory" => \$ample_mem, + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + ### HELPFILE + if ($help){ + print_helpfile(); + exit; + } + + if ($extractor_version){ + print << "VERSION"; + + + Bismark Methylation Extractor + + Bismark Extractor Version: $version + Copyright 2010-13 Felix Krueger, Babraham Bioinformatics + www.bioinformatics.babraham.ac.uk/projects/bismark/ + + +VERSION + exit; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + warn "\n *** Bismark methylation extractor version $version ***\n\n"; + + ### M-BIAS ONLY + if ($mbias_only){ + if ($bedGraph){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the bedGraph generation. Please respecify!\n"; + } + if ($cytosine_report){ + die "Option '--mbias_only' skips all sorts of methylation extraction, including the genome-wide cytosine methylation report generation. Please respecify!\n"; + } + if ($merge_non_CpG){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--merge' won't have any effect\n"; + } + if ($full){ + warn "Option '--mbias_only' skips all sorts of methylation extraction, thus '--comprehensive' won't have any effect\n"; + } + sleep(3); + } + + ### PRINT A REPORT + unless ($report){ + $report = 0; + } + + ### OUTPUT DIR PATH + if ($output_dir){ + unless ($output_dir =~ /\/$/){ + $output_dir =~ s/$/\//; + } + } + else{ + $output_dir = ''; + } + + ### NO HEADER + unless ($no_header){ + $no_header = 0; + } + + ### OLD (VANILLA) OUTPUT FORMAT + unless ($vanilla){ + $vanilla = 0; + } + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + + ### we will try to determine whether the input file was a single-end or paired-end sequencing run from the SAM header + + if ($vanilla){ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format with '-s' or '-p'\n\n"; + } + else{ # SAM/BAM format + + my $file = $filenames[0]; + warn "Trying to determine the type of mapping from the SAM header line of file $file\n"; sleep(1); + + ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file + if ($file =~ /\.gz$/){ + open (DETERMINE,"zcat $file |") or die "Unable to read from gzipped file $file: $!\n"; + } + elsif ($file =~ /\.bam$/ || `file -b $file` =~ /^gzip/){ + open (DETERMINE,"samtools view -h $file |") or die "Unable to read from BAM file $file: $!\n"; + } + else{ + open (DETERMINE,$file) or die "Unable to read from $file: $!\n"; + } + + while (){ + last unless (/^\@/); + if ($_ =~ /^\@PG/){ + # warn "found the \@PG line:\n"; + # warn "$_"; + + if ($_ =~ /-1/ and $_ =~ /-2/){ + warn "Treating file(s) as paired-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 1; + $single_end = 0; + } + else{ + warn "Treating file(s) as single-end data (as extracted from \@PG line)\n\n"; sleep(1); + $paired_end = 0; + $single_end = 1; + } + } + } + + close DETERMINE or warn $!; + + } + } + + ### IGNORING bases at the start of the read when processing the methylation call string + unless ($ignore){ + $ignore = 0; + } + + if (defined $ignore_r2){ + die "You can only specify --ignore_r2 for paired-end result files\n" unless ($paired_end); + } + else{ + $ignore_r2 = 0; + } + + + ### NO OVERLAP + if ($no_overlap){ + die "The option '--no_overlap' can only be specified for paired-end input!\n" unless ($paired_end); + } + else{ + $no_overlap = 0; + } + + ### COMPREHENSIVE OUTPUT + unless ($full){ + $full = 0; + } + + ### MERGE NON-CpG context + unless ($merge_non_CpG){ + $merge_non_CpG = 0; + } + + ### remove white spaces in read ID (needed for sorting using the sort command + unless ($remove){ + $remove = 0; + } + + ### COVERAGE THRESHOLD FOR bedGraph OUTPUT + if (defined $coverage_threshold){ + unless ($coverage_threshold > 0){ + die "Please select a coverage greater than 0 (positive integers only)\n"; + } + } + else{ + $coverage_threshold = 1; + } + + ### SORT buffer size + if (defined $sort_size){ + unless ($sort_size =~ /^\d+\%$/ or $sort_size =~ /^\d+(K|M|G|T)$/){ + die "Please select a buffer size as percentage (e.g. --buffer_size 20%) or a number to be multiplied with K, M, G, T etc. (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line\n"; + } + } + else{ + $sort_size = '2G'; + } + + if ($zero){ + die "Option '--zero' is only available if '--cytosine_report' is specified as well. Please respecify\n" unless ($cytosine_report); + } + + if ($CX_context){ + die "Option '--CX_context' is only available if '--cytosine_report' or '--bedGraph' is specified as well. Please respecify\n" unless ($cytosine_report or $bedGraph); + } + else{ + $CX_context = 0; + } + + unless ($counts){ + $counts = 1; # counts will always be set + } + + if ($cytosine_report){ + + ### GENOME folder + if ($genome_folder){ + unless ($genome_folder =~/\/$/){ + $genome_folder =~ s/$/\//; + } + } + else{ + die "Please specify a genome folder to proceed (full path only)\n"; + } + + unless ($bedGraph){ + warn "Setting the option '--bedGraph' since this is required for the genome-wide cytosine report\n"; + $bedGraph = 1; + } + unless ($counts){ + # warn "Setting the option '--counts' since this is required for the genome-wide cytosine report\n"; + $counts = 1; + } + warn "\n"; + } + + ### PATH TO SAMTOOLS + if (defined $samtools_path){ + # if Samtools was specified as full command + if ($samtools_path =~ /samtools$/){ + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + else{ + unless ($samtools_path =~ /\/$/){ + $samtools_path =~ s/$/\//; + } + $samtools_path .= 'samtools'; + if (-e $samtools_path){ + # Samtools executable found + } + else{ + die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; + } + } + } + # Check whether Samtools is in the PATH if no path was supplied by the user + else{ + if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH + $samtools_path = `which samtools`; + chomp $samtools_path; + } + } + + unless (defined $samtools_path){ + $samtools_path = ''; + } + + + if ($gazillion){ + if ($ample_mem){ + die "You can't currently select '--ample_mem' together with '--gazillion'. Make your pick!\n\n"; + } + } + + return ($ignore,$genomic_fasta,$single_end,$paired_end,$full,$report,$no_overlap,$merge_non_CpG,$vanilla,$output_dir,$no_header,$bedGraph,$remove,$coverage_threshold,$counts,$cytosine_report,$genome_folder,$zero,$CpG_only,$CX_context,$split_by_chromosome,$sort_size,$samtools_path,$gzip,$ignore_r2,$mbias_only,$gazillion,$ample_mem); +} + + +sub test_positional_sorting{ + + my $filename = shift; + + print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t"; + sleep(1); + + if ($filename =~ /\.gz$/) { + open (TEST,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { + if ($samtools_path){ + open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (TEST,$filename) or die "Can't open file $filename: $!\n"; + } + + my $count = 0; + + while () { + if (/^\@/) { # testing header lines if they contain the @SO flag (for being sorted) + if (/^\@SO/) { + die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + next; + } + $count++; + + last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID + + my ($id_1) = (split (/\t/)); + + ### reading the next line which should be read 2 + $_ = ; + my ($id_2) = (split (/\t/)); + last unless ($id_2); + ++$count; + + if ($id_1 eq $id_2){ + ### ids are the same + next; + } + else{ ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first + my $id_1_trunc = $id_1; + $id_1_trunc =~ s/\/1$//; + my $id_2_trunc = $id_2; + $id_2_trunc =~ s/\/2$//; + + unless ($id_1_trunc eq $id_2_trunc){ + die "The IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead\n\n"; + } + } + } + # close TEST or die $!; somehow fails on our cluster... + ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other) + warn "...passed!\n"; + sleep(1); + +} + + +sub process_Bismark_results_file{ + my $filename = shift; + + warn "\nNow reading in Bismark result file $filename\n\n"; + + if ($filename =~ /\.gz$/) { + open (IN,"zcat $filename |") or die "Can't open gzipped file $filename: $!\n"; + } + elsif ($filename =~ /bam$/ || `file -b $filename` =~ /^gzip/) { + if ($samtools_path){ + open (IN,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n"; + } + else{ + die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n"; + } + } + else { + open (IN,$filename) or die "Can't open file $filename: $!\n"; + } + + ### Vanilla and SAM output need to read different numbers of header lines + if ($vanilla) { + my $bismark_version = ; ## discarding the Bismark version info + chomp $bismark_version; + $bismark_version =~ s/\r//; # replaces \r line feed + $bismark_version =~ s/Bismark version: //; + if ($bismark_version =~ /^\@/) { + warn "Detected \@ as the first character of the version information. Is it possible that the file is in SAM format?\n\n"; + sleep (2); + } + + unless ($version eq $bismark_version){ + die "The methylation extractor and Bismark itself need to be of the same version!\n\nVersions used:\nmethylation extractor: '$version'\nBismark: '$bismark_version'\n"; + } + } else { + # If the read is in SAM format (default) it can either start with @ header lines or start with alignments directly. + # We are reading from it further down + } + + my $output_filename = (split (/\//,$filename))[-1]; + + ### OPENING OUTPUT-FILEHANDLES + if ($report) { + my $report_filename = $output_filename; + $report_filename =~ s/\.sam$//; + $report_filename =~ s/\.txt$//; + $report_filename =~ s/$/_splitting_report.txt/; + $report_filename = $output_dir . $report_filename; + open (REPORT,'>',$report_filename) or die "Failed to write to file $report_filename $!\n"; + } + + if ($report) { + print REPORT "$output_filename\n\n"; + print REPORT "Parameters used to extract methylation information:\n"; + if ($paired) { + if ($vanilla) { + print REPORT "Bismark result file: paired-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: paired-end (SAM format)\n"; # default + } + } + + if ($single) { + if ($vanilla) { + print REPORT "Bismark result file: single-end (vanilla Bismark format)\n"; + } else { + print REPORT "Bismark result file: single-end (SAM format)\n"; # default + } + } + if ($single){ + if ($ignore) { + print REPORT "Ignoring first $ignore bp\n"; + } + } + else{ # paired-end + if ($ignore) { + print REPORT "Ignoring first $ignore bp of Read 1\n"; + } + if ($ignore_r2){ + print REPORT "Ignoring first $ignore_r2 bp of Read 2\n"; + } + } + + if ($full) { + print REPORT "Output specified: comprehensive\n"; + } else { + print REPORT "Output specified: strand-specific (default)\n"; + } + + if ($no_overlap) { + print REPORT "No overlapping methylation calls specified\n"; + } + if ($genomic_fasta) { + print REPORT "Genomic equivalent sequences will be printed out in FastA format\n"; + } + if ($merge_non_CpG) { + print REPORT "Methylation in CHG and CHH context will be merged into \"non-CpG context\" output\n"; + } + + print REPORT "\n"; + } + +##### open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; + + ### CpG-context and non-CpG context. THIS SECTION IS OPTIONAL + ### if --comprehensive AND --merge_non_CpG was specified we are only writing out one CpG-context and one Any-Other-context result file + if ($full and $merge_non_CpG) { + my $cpg_output = my $other_c_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless ($no_header) { + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in any other context than CpG + $other_c_output =~ s/^/Non_CpG_context_/; + $other_c_output =~ s/sam$/txt/; + $other_c_output =~ s/bam$/txt/; + $other_c_output =~ s/$/.txt/ unless ($other_c_output =~ /\.txt$/); + $other_c_output = $output_dir . $other_c_output; + + if ($gzip){ + $other_c_output .= '.gz'; + open ($fhs{other_context},"| gzip -c - > $other_c_output") or die "Failed to write to $other_c_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{other_context},'>',$other_c_output) or die "Failed to write to $other_c_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context to $other_c_output\n" unless($mbias_only); + push @sorting_files,$other_c_output; + + + unless ($no_header) { + print {$fhs{other_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + ### if only --merge_non_CpG was specified we will write out 8 different output files, depending on where the (first) unique best alignment has been found + elsif ($merge_non_CpG) { + + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in Non-CpG (CC, CT or CA) context + my $other_c_ot = my $other_c_ctot = my $other_c_ctob = my $other_c_ob = $output_filename; + + $other_c_ot =~ s/^/Non_CpG_OT_/; + $other_c_ot =~ s/sam$/txt/; + $other_c_ot =~ s/bam$/txt/; + $other_c_ot =~ s/$/.txt/ unless ($other_c_ot =~ /\.txt$/); + $other_c_ot = $output_dir . $other_c_ot; + + if ($gzip){ + $other_c_ot .= '.gz'; + open ($fhs{0}->{other_c},"| gzip -c - > $other_c_ot") or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{other_c},'>',$other_c_ot) or die "Failed to write to $other_c_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original top strand to $other_c_ot\n" unless($mbias_only); + push @sorting_files,$other_c_ot; + + unless($no_header){ + print {$fhs{0}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctot =~ s/^/Non_CpG_CTOT_/; + $other_c_ctot =~ s/sam$/txt/; + $other_c_ctot =~ s/bam$/txt/; + $other_c_ctot =~ s/$/.txt/ unless ($other_c_ctot =~ /\.txt$/); + $other_c_ctot = $output_dir . $other_c_ctot; + + if ($gzip){ + $other_c_ctot .= '.gz'; + open ($fhs{1}->{other_c},"| gzip -c - > $other_c_ctot") or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{other_c},'>',$other_c_ctot) or die "Failed to write to $other_c_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original top strand to $other_c_ctot\n" unless($mbias_only); + push @sorting_files,$other_c_ctot; + + unless($no_header){ + print {$fhs{1}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ctob =~ s/^/Non_CpG_CTOB_/; + $other_c_ctob =~ s/sam$/txt/; + $other_c_ctob =~ s/bam$/txt/; + $other_c_ctob =~ s/$/.txt/ unless ($other_c_ctob =~ /\.txt$/); + $other_c_ctob = $output_dir . $other_c_ctob; + + if ($gzip){ + $other_c_ctob .= '.gz'; + open ($fhs{2}->{other_c},"| gzip -c - > $other_c_ctob") or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{other_c},'>',$other_c_ctob) or die "Failed to write to $other_c_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the complementary to original bottom strand to $other_c_ctob\n" unless($mbias_only); + push @sorting_files,$other_c_ctob; + + unless($no_header){ + print {$fhs{2}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $other_c_ob =~ s/^/Non_CpG_OB_/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/sam$/txt/; + $other_c_ob =~ s/$/.txt/ unless ($other_c_ob =~ /\.txt$/); + $other_c_ob = $output_dir . $other_c_ob; + + if ($gzip){ + $other_c_ob .= '.gz'; + open ($fhs{3}->{other_c},"| gzip -c - > $other_c_ob") or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{other_c},'>',$other_c_ob) or die "Failed to write to $other_c_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in any other context from the original bottom strand to $other_c_ob\n\n" unless($mbias_only); + push @sorting_files,$other_c_ob; + + unless($no_header){ + print {$fhs{3}->{other_c}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### THIS SECTION IS THE DEFAULT (CpG, CHG and CHH context) + + ### if --comprehensive was specified we are only writing one file per context + elsif ($full) { + my $cpg_output = my $chg_output = my $chh_output = $output_filename; + ### C in CpG context + $cpg_output =~ s/^/CpG_context_/; + $cpg_output =~ s/sam$/txt/; + $cpg_output =~ s/bam$/txt/; + $cpg_output =~ s/$/.txt/ unless ($cpg_output =~ /\.txt$/); + $cpg_output = $output_dir . $cpg_output; + + if ($gzip){ + $cpg_output .= '.gz'; + open ($fhs{CpG_context},"| gzip -c - > $cpg_output") or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + else{ + open ($fhs{CpG_context},'>',$cpg_output) or die "Failed to write to $cpg_output $! \n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context to $cpg_output\n" unless($mbias_only); + push @sorting_files,$cpg_output; + + unless($no_header){ + print {$fhs{CpG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHG context + $chg_output =~ s/^/CHG_context_/; + $chg_output =~ s/sam$/txt/; + $chg_output =~ s/bam$/txt/; + $chg_output =~ s/$/.txt/ unless ($chg_output =~ /\.txt$/); + $chg_output = $output_dir . $chg_output; + + if ($gzip){ + $chg_output .= '.gz'; + open ($fhs{CHG_context},"| gzip -c - > $chg_output") or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHG_context},'>',$chg_output) or die "Failed to write to $chg_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context to $chg_output\n" unless($mbias_only); + push @sorting_files,$chg_output; + + unless($no_header){ + print {$fhs{CHG_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### C in CHH context + $chh_output =~ s/^/CHH_context_/; + $chh_output =~ s/sam$/txt/; + $chh_output =~ s/bam$/txt/; + $chh_output =~ s/$/.txt/ unless ($chh_output =~ /\.txt$/); + $chh_output = $output_dir . $chh_output; + + if ($gzip){ + $chh_output .= '.gz'; + open ($fhs{CHH_context},"| gzip -c - > $chh_output") or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + else{ + open ($fhs{CHH_context},'>',$chh_output) or die "Failed to write to $chh_output $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context to $chh_output\n" unless($mbias_only); + push @sorting_files, $chh_output; + + unless($no_header){ + print {$fhs{CHH_context}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + ### else we will write out 12 different output files, depending on where the (first) unique best alignment was found + else { + my $cpg_ot = my $cpg_ctot = my $cpg_ctob = my $cpg_ob = $output_filename; + + ### For cytosines in CpG context + $cpg_ot =~ s/^/CpG_OT_/; + $cpg_ot =~ s/sam$/txt/; + $cpg_ot =~ s/bam$/txt/; + $cpg_ot =~ s/$/.txt/ unless ($cpg_ot =~ /\.txt$/); + $cpg_ot = $output_dir . $cpg_ot; + + if ($gzip){ + $cpg_ot .= '.gz'; + open ($fhs{0}->{CpG},"| gzip -c - > $cpg_ot") or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CpG},'>',$cpg_ot) or die "Failed to write to $cpg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original top strand to $cpg_ot\n" unless($mbias_only); + push @sorting_files,$cpg_ot; + + unless($no_header){ + print {$fhs{0}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctot =~ s/^/CpG_CTOT_/; + $cpg_ctot =~ s/sam$/txt/; + $cpg_ctot =~ s/bam$/txt/; + $cpg_ctot =~ s/$/.txt/ unless ($cpg_ctot =~ /\.txt$/); + $cpg_ctot = $output_dir . $cpg_ctot; + + if ($gzip){ + $cpg_ctot .= '.gz'; + open ($fhs{1}->{CpG},"| gzip -c - > $cpg_ctot") or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CpG},'>',$cpg_ctot) or die "Failed to write to $cpg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original top strand to $cpg_ctot\n" unless($mbias_only); + push @sorting_files,$cpg_ctot; + + unless($no_header){ + print {$fhs{1}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ctob =~ s/^/CpG_CTOB_/; + $cpg_ctob =~ s/sam$/txt/; + $cpg_ctob =~ s/bam$/txt/; + $cpg_ctob =~ s/$/.txt/ unless ($cpg_ctob =~ /\.txt$/); + $cpg_ctob = $output_dir . $cpg_ctob; + + if ($gzip){ + $cpg_ctob .= '.gz'; + open ($fhs{2}->{CpG},"| gzip -c - > $cpg_ctob") or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CpG},'>',$cpg_ctob) or die "Failed to write to $cpg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the complementary to original bottom strand to $cpg_ctob\n" unless($mbias_only); + push @sorting_files,$cpg_ctob; + + unless($no_header){ + print {$fhs{2}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $cpg_ob =~ s/^/CpG_OB_/; + $cpg_ob =~ s/sam$/txt/; + $cpg_ob =~ s/bam$/txt/; + $cpg_ob =~ s/$/.txt/ unless ($cpg_ob =~ /\.txt$/); + $cpg_ob = $output_dir . $cpg_ob; + + if ($gzip){ + $cpg_ob .= '.gz'; + open ($fhs{3}->{CpG},"| gzip -c - > $cpg_ob") or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CpG},'>',$cpg_ob) or die "Failed to write to $cpg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CpG context from the original bottom strand to $cpg_ob\n\n" unless($mbias_only); + push @sorting_files,$cpg_ob; + + unless($no_header){ + print {$fhs{3}->{CpG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHG context + my $chg_ot = my $chg_ctot = my $chg_ctob = my $chg_ob = $output_filename; + + $chg_ot =~ s/^/CHG_OT_/; + $chg_ot =~ s/sam$/txt/; + $chg_ot =~ s/bam$/txt/; + $chg_ot =~ s/$/.txt/ unless ($chg_ot =~ /\.txt$/); + $chg_ot = $output_dir . $chg_ot; + + if ($gzip){ + $chg_ot .= '.gz'; + open ($fhs{0}->{CHG},"| gzip -c - > $chg_ot") or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHG},'>',$chg_ot) or die "Failed to write to $chg_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original top strand to $chg_ot\n" unless($mbias_only); + push @sorting_files,$chg_ot; + + unless($no_header){ + print {$fhs{0}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctot =~ s/^/CHG_CTOT_/; + $chg_ctot =~ s/sam$/txt/; + $chg_ctot =~ s/bam$/txt/; + $chg_ctot =~ s/$/.txt/ unless ($chg_ctot =~ /\.txt$/); + $chg_ctot = $output_dir . $chg_ctot; + + if ($gzip){ + $chg_ctot .= '.gz'; + open ($fhs{1}->{CHG},"| gzip -c - > $chg_ctot") or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHG},'>',$chg_ctot) or die "Failed to write to $chg_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original top strand to $chg_ctot\n" unless($mbias_only); + push @sorting_files,$chg_ctot; + + unless($no_header){ + print {$fhs{1}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ctob =~ s/^/CHG_CTOB_/; + $chg_ctob =~ s/sam$/txt/; + $chg_ctob =~ s/bam$/txt/; + $chg_ctob =~ s/$/.txt/ unless ($chg_ctob =~ /\.txt$/); + $chg_ctob = $output_dir . $chg_ctob; + + if ($gzip){ + $chg_ctob .= '.gz'; + open ($fhs{2}->{CHG},"| gzip -c - > $chg_ctob") or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHG},'>',$chg_ctob) or die "Failed to write to $chg_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the complementary to original bottom strand to $chg_ctob\n" unless($mbias_only); + push @sorting_files,$chg_ctob; + + unless($no_header){ + print {$fhs{2}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chg_ob =~ s/^/CHG_OB_/; + $chg_ob =~ s/sam$/txt/; + $chg_ob =~ s/bam$/txt/; + $chg_ob =~ s/$/.txt/ unless ($chg_ob =~ /\.txt$/); + $chg_ob = $output_dir . $chg_ob; + + if ($gzip){ + $chg_ob .= '.gz'; + open ($fhs{3}->{CHG},"| gzip -c - > $chg_ob") or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHG},'>',$chg_ob) or die "Failed to write to $chg_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHG context from the original bottom strand to $chg_ob\n\n" unless($mbias_only); + push @sorting_files,$chg_ob; + + unless($no_header){ + print {$fhs{3}->{CHG}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + ### For cytosines in CHH context + my $chh_ot = my $chh_ctot = my $chh_ctob = my $chh_ob = $output_filename; + + $chh_ot =~ s/^/CHH_OT_/; + $chh_ot =~ s/sam$/txt/; + $chh_ot =~ s/bam$/txt/; + $chh_ot =~ s/$/.txt/ unless ($chh_ot =~ /\.txt$/); + $chh_ot = $output_dir . $chh_ot; + + if ($gzip){ + $chh_ot .= '.gz'; + open ($fhs{0}->{CHH},"| gzip -c - > $chh_ot") or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + else{ + open ($fhs{0}->{CHH},'>',$chh_ot) or die "Failed to write to $chh_ot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original top strand to $chh_ot\n" unless($mbias_only); + push @sorting_files,$chh_ot; + + unless($no_header){ + print {$fhs{0}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctot =~ s/^/CHH_CTOT_/; + $chh_ctot =~ s/sam$/txt/; + $chh_ctot =~ s/bam$/txt/; + $chh_ctot =~ s/$/.txt/ unless ($chh_ctot =~ /\.txt$/); + $chh_ctot = $output_dir . $chh_ctot; + + if ($gzip){ + $chh_ctot .= '.gz'; + open ($fhs{1}->{CHH},"| gzip -c - > $chh_ctot") or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + else{ + open ($fhs{1}->{CHH},'>',$chh_ctot) or die "Failed to write to $chh_ctot $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original top strand to $chh_ctot\n" unless($mbias_only); + push @sorting_files,$chh_ctot; + + unless($no_header){ + print {$fhs{1}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ctob =~ s/^/CHH_CTOB_/; + $chh_ctob =~ s/sam$/txt/; + $chh_ctob =~ s/bam$/txt/; + $chh_ctob =~ s/$/.txt/ unless ($chh_ctob =~ /\.txt$/); + $chh_ctob = $output_dir . $chh_ctob; + + if ($gzip){ + $chh_ctob .= '.gz'; + open ($fhs{2}->{CHH},"| gzip -c - > $chh_ctob") or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + else{ + open ($fhs{2}->{CHH},'>',$chh_ctob) or die "Failed to write to $chh_ctob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the complementary to original bottom strand to $chh_ctob\n" unless($mbias_only); + push @sorting_files,$chh_ctob; + + unless($no_header){ + print {$fhs{2}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + + $chh_ob =~ s/^/CHH_OB_/; + $chh_ob =~ s/sam$/txt/; + $chh_ob =~ s/bam$/txt/; + $chh_ob =~ s/$/.txt/ unless ($chh_ob =~ /\.txt$/); + $chh_ob = $output_dir . $chh_ob; + + if ($gzip){ + $chh_ob .= '.gz'; + open ($fhs{3}->{CHH},"| gzip -c - > $chh_ob") or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + else{ + open ($fhs{3}->{CHH},'>',$chh_ob) or die "Failed to write to $chh_ob $!\n" unless($mbias_only); + } + + warn "Writing result file containing methylation information for C in CHH context from the original bottom strand to $chh_ob\n\n" unless($mbias_only); + push @sorting_files,$chh_ob; + + unless($no_header){ + print {$fhs{3}->{CHH}} "Bismark methylation extractor version $version\n" unless($mbias_only); + } + } + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + ### proceeding differently now for single-end or paired-end Bismark files + + ### PROCESSING SINGLE-END RESULT FILES + if ($single) { + + ### also proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark output format + while () { + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start,$seq,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,6,7,8,9]; + + ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the first or + ### last position + chomp $genome_conversion; + + my $index; + if ($meth_call) { + + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + $start += $ignore; + } elsif ($strand eq '-') { + $start += length($meth_call)-1; ## $meth_call is already shortened! + } else { + die "Alignment did not have proper strand information: $strand\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } else { # processing single-end SAM format (default) + while () { + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example read in SAM format + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + ### + + # < 0.7.6 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + # < 0.7.6 $meth_call =~ s/^XM:Z://; + # < 0.7.6 $read_conversion =~ s/^XR:Z://; + # < 0.7.6 $genome_conversion =~ s/^XG:Z://; + + my ($id,$chrom,$start,$cigar) = (split("\t"))[0,2,3,5]; + + ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call; ### Thanks to Zachary Zeno for this solution + my $read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call = $value; + $meth_call =~ s/\r//; + } elsif ($tag eq "XR") { + $read_conversion = $value; + $read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + my $strand; + chomp $genome_conversion; + # print "$meth_call\n$read_conversion\n$genome_conversion\n"; + + my $index; + if ($meth_call) { + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT') { ## original top strand + $index = 0; + $strand = '+'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT') { ## complementary to original top strand + $index = 1; + $strand = '-'; + } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA') { ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA') { ## original bottom strand + $index = 3; + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: '$read_conversion' / '$genome_conversion'\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-') { + $meth_call = reverse $meth_call; + } + + ### Clipping off the first number of bases from the methylation call string as specified with --ignore + if ($ignore) { + # print "\n\n$meth_call\n"; + $meth_call = substr($meth_call,$ignore,length($meth_call)-$ignore); + # print "$meth_call\n"; + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + my @len = split (/\D+/,$cigar); # storing the length per operation + my @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + my @comp_cigar; # building an array with all CIGAR operations + foreach my $index (0..$#len) { + foreach (1..$len[$index]) { + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # print "original CIGAR: $cigar\n"; + # print "original CIGAR: @comp_cigar\n"; + + ### If we are clipping off some bases at the start we need to adjust the start position of the alignments accordingly! + if ($strand eq '+') { + + my $D_count = 0; # counting all deletions that affect the ignored genomic position, i.e. Deletions and insertions + my $I_count = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count++; + $op = shift @comp_cigar; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count++; + } + } + $start += $ignore + $D_count - $I_count; + # print "start $start\t ignore: $ignore\t D count: $D_count I_count: $I_count\n"; + } elsif ($strand eq '-') { + + for (1..$ignore) { + my $op = pop @comp_cigar; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar; + } + } + + ### For reverse strand alignments we need to determine the number of matching bases (M) or deletions (D) in the read from the CIGAR + ### string to be able to work out the starting position of the read which is on the 3' end of the sequence + my $MD_count = 0; # counting all operations that affect the genomic position, i.e. M and D. Insertions do not affect the start position + foreach (@comp_cigar) { + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); + } + $start += $MD_count - 1; + } + + ### reconstituting shortened CIGAR string + my $new_cigar; + my $count = 0; + my $last_op; + # print "ignore adjusted: @comp_cigar\n"; + foreach my $op (@comp_cigar) { + unless (defined $last_op){ + $last_op = $op; + ++$count; + next; + } + if ($last_op eq $op) { + ++$count; + } else { + $new_cigar .= "$count$last_op"; + $last_op = $op; + $count = 1; + } + } + $new_cigar .= "$count$last_op"; # appending the last operation and count + $cigar = $new_cigar; + # print "ignore adjusted scalar: $cigar\n"; + } + } + ### printing out the methylation state of every C in the read + print_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index,$cigar); + + ++$methylation_call_strings_processed; # 1 per single-end result + } + } + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired) { + + ### proceeding differently now for SAM format or vanilla Bismark format files + if ($vanilla) { # old vanilla Bismark paired-end output format + while () { + ++$line_count; + warn "processed line: $line_count\n" if ($line_count%500000==0); + + ### $seq here is the chromosomal sequence (to use for the repeat analysis for example) + my ($id,$strand,$chrom,$start_read_1,$end_read_2,$seq_1,$meth_call_1,$seq_2,$meth_call_2,$first_read_conversion,$genome_conversion) = (split("\t"))[0,1,2,3,4,6,7,9,10,11,12,13]; + + my $index; + chomp $genome_conversion; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB!!! + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT!!! + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + if ($meth_call_1 and $meth_call_2) { + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' + + if ($ignore) { + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore' was specified + $start_read_1 += $ignore; + } + if ($ignore_r2) { + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### we also need to adjust the start and end positions of the alignments accordingly if '--ignore_r2' was specified + $end_read_2 -= $ignore_r2; + } + + my $end_read_1; + my $start_read_2; + + if ($strand eq '+') { + + $end_read_1 = $start_read_1+length($meth_call_1)-1; + $start_read_2 = $end_read_2-length($meth_call_2)+1; + + ## we first pass the first read which is in + orientation on the forward strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id,'+',$index,0,0,undef,1); # the last two values are CIGAR string and read identity + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$end_read_2,$id,'-',$index,$no_overlap,$end_read_1,undef,2); + } + else { + + $end_read_1 = $start_read_1+length($meth_call_2)-1; # read 1 is the second reported read! + $start_read_2 = $end_read_2-length($meth_call_1)+1; # read 2 is the first reported read! + + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$end_read_2,$id,'-',$index,0,0,undef,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 2. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_1,$id,'+',$index,$no_overlap,$start_read_2,undef,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + else { # Bismark paired-end SAM output format (default) + while () { + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/) { # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + # < version 0.7.6 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + + my ($id_1,$chrom,$start_read_1,$cigar_1) = (split("\t"))[0,2,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + my $meth_call_1; + my $first_read_conversion; + my $genome_conversion; + + while ( /(XM|XR|XG):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_1 = $value; + $meth_call_1 =~ s/\r//; + } elsif ($tag eq "XR") { + $first_read_conversion = $value; + $first_read_conversion =~ s/\r//; + } elsif ($tag eq "XG") { + $genome_conversion = $value; + $genome_conversion =~ s/\r//; + } + } + + $_ = ; # reading in the paired read + + # < version 0.7.6 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; + # < version 0.7.6 $meth_call_1 =~ s/^XM:Z://; + # < version 0.7.6 $meth_call_2 =~ s/^XM:Z://; + # < version 0.7.6 $first_read_conversion =~ s/^XR:Z://; + # < version 0.7.6 $second_read_conversion =~ s/^XR:Z://; + + my ($id_2,$start_read_2,$cigar_2) = (split("\t"))[0,3,5]; ### detecting the following SAM flags in case the SAM entry was shuffled by CRAM or Goby compression/decompression + + my $meth_call_2; + my $second_read_conversion; + + while ( /(XM|XR):Z:([^\t]+)/g ) { + my $tag = $1; + my $value = $2; + + if ($tag eq "XM") { + $meth_call_2 = $value; + $meth_call_2 =~ s/\r//; + } elsif ($tag eq "XR") { + $second_read_conversion = $value; + $second_read_conversion = s/\r//; + } + } + + # < version 0.7.6 $genome_conversion =~ s/^XG:Z://; + chomp $genome_conversion; # in case it captured a new line character + + # print join ("\t",$meth_call_1,$meth_call_2,$first_read_conversion,$second_read_conversion,$genome_conversion),"\n"; + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT') { + $index = 0; ## this is OT + $strand = '+'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT') { + $index = 1; ## this is CTOT + $strand = '-'; + } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA') { + $index = 2; ## this is CTOB + $strand = '+'; + } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA') { + $index = 3; ## this is OB + $strand = '-'; + } else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+') { + $meth_call_2 = reverse $meth_call_2; + } else { + $meth_call_1 = reverse $meth_call_1; + } + + if ($meth_call_1 and $meth_call_2) { + + my $end_read_1; + + ### READ 1 + my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation + my @ops_1 = split (/\d+/,$cigar_1); # storing the operation + shift @ops_1; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations: $cigar_1\n".join(" ",@len_1)."\n".join(" ",@ops_1)."\n" unless (scalar @len_1 == scalar @ops_1); + + my @comp_cigar_1; # building an array with all CIGAR operations + foreach my $index (0..$#len_1) { + foreach (1..$len_1[$index]) { + # print "$ops_1[$index]"; + push @comp_cigar_1, $ops_1[$index]; + } + } + # print "original CIGAR read 1: $cigar_1\n"; + # print "original CIGAR read 1: @comp_cigar_1\n"; + + ### READ 2 + my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation + my @ops_2 = split (/\d+/,$cigar_2); # storing the operation + shift @ops_2; # remove the empty first element + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); + my @comp_cigar_2; # building an array with all CIGAR operations for read 2 + foreach my $index (0..$#len_2) { + foreach (1..$len_2[$index]) { + # print "$ops_2[$index]"; + push @comp_cigar_2, $ops_2[$index]; + } + } + # print "original CIGAR read 2: $cigar_2\n"; + # print "original CIGAR read 2: @comp_cigar_2\n"; + + + + if ($ignore) { + ### Clipping off the first number of bases from the methylation call strings as specified with '--ignore ' for read 1 + ### the methylation calls have already been reversed where necessary + $meth_call_1 = substr($meth_call_1,$ignore,length($meth_call_1)-$ignore); + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 1 needs to be trimmed from the start + my $D_count_1 = 0; # counting all deletions that affect the ignored genomic position for read 1, i.e. Deletions and insertions + my $I_count_1 = 0; + + for (1..$ignore) { + my $op = shift @comp_cigar_1; # adjusting composite CIGAR string of read 1 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_1++; + $op = shift @comp_cigar_1; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_1++; + } + } + + $start_read_1 += $ignore + $D_count_1 - $I_count_1; + # print "start read 1 $start_read_1\t ignore: $ignore\t D count 1: $D_count_1\tI_count 1: $I_count_1\n"; + + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 1 needs to be trimmed from the back + for (1..$ignore) { + my $op = pop @comp_cigar_1; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_1; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + + } + } + + if ($ignore_r2) { + ### Clipping off the first number of bases from the methylation call string as specified with '--ignore_r2 ' for read 2 + ### the methylation calls have already been reversed where necessary + $meth_call_2 = substr($meth_call_2,$ignore_r2,length($meth_call_2)-$ignore_r2); + + ### If we are ignoring a part of the sequence we also need to adjust the cigar string accordingly + + if ($strand eq '+') { + + ### if the (read 1) strand information is '+', read 2 needs to be trimmed from the back + + for (1..$ignore_r2) { + my $op = pop @comp_cigar_2; # adjusting composite CIGAR string by removing $ignore operations, here the last value of the array + while ($op eq 'D') { # repeating this for deletions (D) + $op = pop @comp_cigar_2; + } + } + # the start position of reads mapping to the reverse strand is being adjusted further below + } + elsif ($strand eq '-') { + + ### if the (read 1) strand information is '-', read 2 needs to be trimmed from the start + my $D_count_2 = 0; # counting all deletions that affect the ignored genomic position for read 2, i.e. Deletions and insertions + my $I_count_2 = 0; + + for (1..$ignore_r2) { + my $op = shift @comp_cigar_2; # adjusting composite CIGAR string of read 2 by removing $ignore operations from the start + # print "$_ deleted $op\n"; + + while ($op eq 'D') { # repeating this for deletions (D) + $D_count_2++; + $op = shift @comp_cigar_2; + # print "$_ deleted $op\n"; + } + if ($op eq 'I') { # adjusting the genomic position for insertions (I) + $I_count_2++; + } + } + + $start_read_2 += $ignore_r2 + $D_count_2 - $I_count_2; + # print "start read 2 $start_read_2\t ignore R2: $ignore_r2\t D count 2: $D_count_2\tI_count 2: $I_count_2\n"; + } + } + + if ($ignore){ + ### reconstituting shortened CIGAR string 1 + my $new_cigar_1; + my $count_1 = 0; + my $last_op_1; + # print "ignore adjusted CIGAR 1: @comp_cigar_1\n"; + foreach my $op (@comp_cigar_1) { + unless (defined $last_op_1){ + $last_op_1 = $op; + ++$count_1; + next; + } + if ($last_op_1 eq $op) { + ++$count_1; + } else { + $new_cigar_1 .= "$count_1$last_op_1"; + $last_op_1 = $op; + $count_1 = 1; + } + } + $new_cigar_1 .= "$count_1$last_op_1"; # appending the last operation and count + $cigar_1 = $new_cigar_1; + # print "ignore adjusted CIGAR 1 scalar: $cigar_1\n"; + } + + if ($ignore_r2){ + + ### reconstituting shortened CIGAR string 2 + my $new_cigar_2; + my $count_2 = 0; + my $last_op_2; + # print "ignore adjusted CIGAR 2: @comp_cigar_2\n"; + foreach my $op (@comp_cigar_2) { + unless (defined $last_op_2){ + $last_op_2 = $op; + ++$count_2; + next; + } + if ($last_op_2 eq $op) { + ++$count_2; + } + else { + $new_cigar_2 .= "$count_2$last_op_2"; + $last_op_2 = $op; + $count_2 = 1; + } + } + $new_cigar_2 .= "$count_2$last_op_2"; # appending the last operation and count + $cigar_2 = $new_cigar_2; + # print "ignore_r2 adjusted CIGAR 2 scalar: $cigar_2\n"; + } + + ### Adjusting CIGAR string and starting position of reads in reverse orientation which we will pass to the extraction subroutine later on + + if ($strand eq '+') { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 2 + @comp_cigar_2 = reverse@comp_cigar_2; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_2\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + my $MD_count_2 = 0; + foreach (@comp_cigar_2) { + ++$MD_count_2 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1 + $MD_count_1 - 1; + $start_read_2 += $MD_count_2 - 1; ## Passing on the start position on the reverse strand + } + else { + ### adjusting the start position for all reads mapping to the reverse strand, in this case read 1 + + @comp_cigar_1 = reverse@comp_cigar_1; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print "reverse: @comp_cigar_1\n"; + + my $MD_count_1 = 0; + foreach (@comp_cigar_1) { + ++$MD_count_1 if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + + $end_read_1 = $start_read_1; + $start_read_1 += $MD_count_1 - 1; ### Passing on the start position on the reverse strand + } + + if ($strand eq '+') { + ## we first pass the first read which is in + orientation on the forward strand; the last value is the read identity + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$no_overlap,$end_read_1,$cigar_2,2); + } else { + ## we first pass the first read which is in - orientation on the reverse strand + print_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0,0,$cigar_1,1); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + print_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$no_overlap,$end_read_1,$cigar_2,2); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + } + } + } else { + die "Single-end or paired-end reads not specified properly\n"; + } + + warn "\n\nProcessed $line_count lines from $filename in total\n"; + warn "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + if ($report) { + print REPORT "\n\nProcessed $line_count lines from $filename in total\n"; + print REPORT "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + } + print_splitting_report (); +} + + + +sub print_splitting_report{ + + ### Calculating methylation percentages if applicable + + my $percent_meCpG; + if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ + $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); + } + + my $percent_meCHG; + if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); + } + + my $percent_meCHH; + if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ + $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); + } + + my $percent_non_CpG_methylation; + if ($merge_non_CpG){ + if ( ($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ + $percent_non_CpG_methylation = sprintf("%.1f",100* ( $counting{total_meCHH_count}+$counting{total_meCHG_count} ) / ( $counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}+$counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count} ) ); + } + } + + if ($report){ + ### detailed information about Cs analysed + print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### calculating methylated CpG percentage if applicable + if ($percent_meCpG){ + print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print REPORT "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3 Context Output + else{ + ### calculating methylated CHG percentage if applicable + if ($percent_meCHG){ + print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### calculating methylated CHH percentage if applicable + if ($percent_meCHH){ + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } + } + + ### detailed information about Cs analysed for on-screen report + print "Final Cytosine Methylation Report\n",'='x33,"\n"; + + my $total_number_of_C = $counting{total_meCHG_count}+$counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; + print "Total number of C's analysed:\t$total_number_of_C\n\n"; + + print "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; + print "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; + print "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; + + print "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + + ### printing methylated CpG percentage if applicable + if ($percent_meCpG){ + print "C methylated in CpG context:\t${percent_meCpG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; + } + + ### 2-Context Output + if ($merge_non_CpG){ + if ($percent_non_CpG_methylation){ + print "C methylated in non-CpG context:\t${percent_non_CpG_methylation}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in non-CpG context if value was 0\n\n\n"; + } + } + + ### 3-Context Output + else{ + ### printing methylated CHG percentage if applicable + if ($percent_meCHG){ + print "C methylated in CHG context:\t${percent_meCHG}%\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; + } + + ### printing methylated CHH percentage if applicable + if ($percent_meCHH){ + print "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + } + else{ + print "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; + } + } +} + + + +sub print_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$no_overlap,$end_read_1,$cigar,$read_identity) = @_; + + ### we will use the read identity for the M-bias plot to discriminate read 1 and read 2 + die "Read identity was neither 1 nor 2: $read_identity\n\n" unless ($read_identity == 1 or $read_identity == 2); + + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in Unknown context (was protected) ### + ### u for not methylated C in Unknown context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + my @comp_cigar; + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # this check speeds up the extraction process by up to 60%!!! + } + else{ # parsing CIGAR string + my @len; + my @ops; + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + if ($strand eq '-') { + + ### the CIGAR string needs to be reversed, the methylation call has already been reversed above + if (@comp_cigar){ + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + } + # print "reverse CIGAR string: @comp_cigar\n"; + + ### the start position of paired-end files has already been corrected, see above + } + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and non-CpG) SECTION IF --merge_non_CpG was specified + + if ($merge_non_CpG) { + if ($no_overlap) { # this has to be read 2... + + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The read orientation was neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation was neither + nor -: '$strand'/n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + ### Still within the 2-CONTEXT ONLY optional section + else { + ### single-file CpG and non-CpG context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + ### still within the 2-CONTEXT optional section + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + } + + ############################################ + ### THIS IS THE DEFAULT 3-CONTEXT OUTPUT ### + ############################################ + + elsif ($no_overlap) { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start+$index+$pos_offset >= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### Returning as soon as the methylation calls start overlapping + if ($start-$index+$pos_offset <= $end_read_1) { + return; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } + + ### this is the default paired-end procedure allowing overlaps and using every single C position + else { + ### single-file CpG, CHG and CHH context output + if ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM reads with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CpG}->{$index+1}->{un}++; + } + else{ + $mbias_2{CpG}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{meth}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{meth}++; + } + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + if ($read_identity == 1){ + $mbias_1{CHH}->{$index+1}->{un}++; + } + else{ + $mbias_2{CHH}->{$index+1}->{un}++; + } + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "The strand orientation as neither + nor -: '$strand'\n"; + } + } + } +} + +sub check_cigar_string { + my ($index,$cigar_offset,$pos_offset,$strand,$comp_cigar) = @_; + # print "$index\t$cigar_offset\t$pos_offset\t$strand\t"; + my ($new_cigar_offset,$new_pos_offset) = (0,0); + + if ($strand eq '+') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by -1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by +1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset -= 1; # we need to subtract the length of inserted bases from the genomic position + # warn "adjusted genomic position by another -1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset += 1; # we need to add the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another +1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + + elsif ($strand eq '-') { + # print "### $strand strand @$comp_cigar[$index + $cigar_offset]\t"; + + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "position needs no adjustment\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ # insertion in the read sequence + $new_pos_offset += 1; # we need to add the length of inserted bases to the genomic position + # warn "adjusted genomic position by +1 bp (insertion)\n"; + } + + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by -1 bp (deletion). Now looping through the CIGAR string until we hit another M or I\n"; + + while ( ($index + $cigar_offset + $new_cigar_offset) < (scalar @$comp_cigar) ){ + if (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'M'){ # sequence position matches the genomic position + # warn "Found new 'M' operation; position needs no adjustment\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'I'){ + $new_pos_offset += 1; # we need to subtract the length of inserted bases from the genomic position + # warn "Found new 'I' operation; adjusted genomic position by another +1 bp (insertion)\n"; + last; + } + elsif (@$comp_cigar[$index + $cigar_offset + $new_cigar_offset] eq 'D'){ # deletion in the read sequence + $new_cigar_offset += 1; # the composite cigar string does no longer match the methylation call index + $new_pos_offset -= 1; # we need to subtract the length of deleted bases to get the genomic position + # warn "adjusted genomic position by another -1 bp (deletion)\n"; + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + } + else{ + die "The CIGAR string contained undefined operations in addition to 'M', 'I' and 'D': '@$comp_cigar[$index + $cigar_offset + $new_cigar_offset]'\n"; + } + } + # print "new cigar offset: $new_cigar_offset\tnew pos offset: $new_pos_offset\n"; + return ($new_cigar_offset,$new_pos_offset); +} + +sub print_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$cigar) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + my $methyl_CHG_count = 0; + my $methyl_CHH_count = 0; + my $methyl_CpG_count = 0; + my $unmethylated_CHG_count = 0; + my $unmethylated_CHH_count = 0; + my $unmethylated_CpG_count = 0; + + my $pos_offset = 0; # this is only relevant for SAM reads with insertions or deletions + my $cigar_offset = 0; # again, this is only relevant for SAM reads containing indels + + my @comp_cigar; + + if ($cigar){ # parsing CIGAR string + + ### Checking whether the CIGAR string is a linear genomic match or whether if requires indel processing + if ($cigar =~ /^\d+M$/){ + # warn "See!? I told you so! $cigar\n"; + # sleep(1); + } + else{ + + my @len; + my @ops; + + @len = split (/\D+/,$cigar); # storing the length per operation + @ops = split (/\d+/,$cigar); # storing the operation + shift @ops; # remove the empty first element + # die "CIGAR string contained a non-matching number of lengths and operations: id: $id\nmeth call: $meth_call\nCIGAR: $cigar\n".join(" ",@len)."\n".join(" ",@ops)."\n" unless (scalar @len == scalar @ops); + die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); + + foreach my $index (0..$#len){ + foreach (1..$len[$index]){ + # print "$ops[$index]"; + push @comp_cigar, $ops[$index]; + } + } + } + # warn "\nDetected CIGAR string: $cigar\n"; + # warn "Length of methylation call: ",length $meth_call,"\n"; + # warn "number of operations: ",scalar @ops,"\n"; + # warn "number of length digits: ",scalar @len,"\n\n"; + # print @comp_cigar,"\n"; + # print "$meth_call\n\n"; + # sleep (1); + } + + ### adjusting the start position for all reads mapping to the reverse strand + if ($strand eq '-') { + + if (@comp_cigar){ # only needed for SAM reads with InDels + @comp_cigar = reverse@comp_cigar; # the CIGAR string needs to be reversed for all reads aligning to the reverse strand, too + # print @comp_cigar,"\n"; + } + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + + if ($cigar){ ### SAM format + if ($cigar =~ /^(\d+)M$/){ # linear match + $start += $1 - 1; + } + else{ # InDel read + my $MD_count = 0; + foreach (@comp_cigar){ + ++$MD_count if ($_ eq 'M' or $_ eq 'D'); # Matching bases or deletions affect the genomic position of the 3' ends of reads, insertions don't + } + $start += $MD_count - 1; + } + } + else{ ### vanilla format + $start += length($meth_call)-1; + } + } + } + + ### THIS IS THE CpG and Non-CpG SECTION (OPTIONAL) + + ### single-file CpG and other-context output + if ($full and $merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + if ($cigar and @comp_cigar){ # only needed for SAM alignments with InDels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition+index: ",$start+$index,"\t"; + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM entries with InDels + # print "index: $index\tmethylation_call: $methylation_calls[$index]\tposition-index: ",$start-$index,"\t"; + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{other_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.'){} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + elsif ($merge_non_CpG) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{other_c}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } + + ### THIS IS THE 3-CONTEXT (CpG, CHG and CHH) DEFAULT SECTION + + elsif ($full) { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n" unless($mbias_only); + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{CHG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{CpG_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{CHH_context}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The read had a strand orientation which was neither + nor -: $strand\n"; + } + } + + ### strand-specific methylation output + else { + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start+$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + + if ($cigar and @comp_cigar){ # only needed for SAM reads with Indels + my ($cigar_mod,$pos_mod) = check_cigar_string($index,$cigar_offset,$pos_offset,$strand,\@comp_cigar); + $cigar_offset += $cigar_mod; + $pos_offset += $pos_mod; + } + + if ($methylation_calls[$index] eq 'X') { + $counting{total_meCHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'x') { + $counting{total_unmethylated_CHG_count}++; + print {$fhs{$filehandle_index}->{CHG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'Z') { + $counting{total_meCpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'z') { + $counting{total_unmethylated_CpG_count}++; + print {$fhs{$filehandle_index}->{CpG}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CpG}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq 'H') { + $counting{total_meCHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'+',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{meth}++; + } + elsif ($methylation_calls[$index] eq 'h') { + $counting{total_unmethylated_CHH_count}++; + print {$fhs{$filehandle_index}->{CHH}} join ("\t",$id,'-',$chrom,$start-$index+$pos_offset,$methylation_calls[$index]),"\n" unless($mbias_only); + $mbias_1{CHH}->{$index+1}->{un}++; + } + elsif ($methylation_calls[$index] eq '.') {} + elsif (lc$methylation_calls[$index] eq 'u'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "The strand information was neither + nor -: $strand\n"; + } + } +} + + + +sub print_helpfile{ + + print << 'HOW_TO'; + + +DESCRIPTION + +The following is a brief description of all options to control the Bismark +methylation extractor. The script reads in a bisulfite read alignment results file +produced by the Bismark bisulfite mapper and extracts the methylation information +for individual cytosines. This information is found in the methylation call field +which can contain the following characters: + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ~~~ X for methylated C in CHG context ~~~ + ~~~ x for not methylated C CHG ~~~ + ~~~ H for methylated C in CHH context ~~~ + ~~~ h for not methylated C in CHH context ~~~ + ~~~ Z for methylated C in CpG context ~~~ + ~~~ z for not methylated C in CpG context ~~~ + ~~~ U for methylated C in Unknown context (CN or CHN ~~~ + ~~~ u for not methylated C in Unknown context (CN or CHN) ~~~ + ~~~ . for any bases not involving cytosines ~~~ + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The methylation extractor outputs result files for cytosines in CpG, CHG and CHH +context (this distinction is actually already made in Bismark itself). As the methylation +information for every C analysed can produce files which easily have tens or even hundreds of +millions of lines, file sizes can become very large and more difficult to handle. The C +methylation info additionally splits cytosine methylation calls up into one of the four possible +strands a given bisulfite read aligned against: + + OT original top strand + CTOT complementary to original top strand + + OB original bottom strand + CTOB complementary to original bottom strand + +Thus, by default twelve individual output files are being generated per input file (unless +--comprehensive is specified, see below). The output files can be imported into a genome +viewer, such as SeqMonk, and re-combined into a single data group if desired (in fact +unless the bisulfite reads were generated preserving directionality it doesn't make any +sense to look at the data in a strand-specific manner). Strand-specific output files can +optionally be skipped, in which case only three output files for CpG, CHG or CHH context +will be generated. For both the strand-specific and comprehensive outputs there is also +the option to merge both non-CpG contexts (CHG and CHH) into one single non-CpG context. + + +The output files are in the following format (tab delimited): + + + + +USAGE: methylation_extractor [options] + + +ARGUMENTS: +========== + + A space-separated list of Bismark result files in SAM format from + which methylation information is extracted for every cytosine in + the reads. For alignment files in the older custom Bismark output + see option '--vanilla'. + +OPTIONS: + +-s/--single-end Input file(s) are Bismark result file(s) generated from single-end + read data. Specifying either --single-end or --paired-end is + mandatory. + +-p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end + read data. Specifying either --paired-end or --single-end is + mandatory. + +--vanilla The Bismark result input file(s) are in the old custom Bismark format + (up to version 0.5.x) and not in SAM format which is the default as + of Bismark version 0.6.x or higher. Default: OFF. + +--no_overlap For paired-end reads it is theoretically possible that read_1 and + read_2 overlap. This option avoids scoring overlapping methylation + calls twice (only methylation calls of read 1 are used for in the process + since read 1 has historically higher quality basecalls than read 2). + Whilst this option removes a bias towards more methylation calls + in the center of sequenced fragments it may de facto remove a sizable + proportion of the data. This option is highly recommended for paired-end + data. + +--ignore Ignore the first bp from the 5' end of Read 1 when processing the + methylation call string. This can remove e.g. a restriction enzyme site + at the start of each read or any other source of bias (e.g. PBAT-Seq data). + +--ignore_r2 Ignore the first bp from the 5' end of Read 2 of paired-end sequencing + results only. Since the first couple of bases in Read 2 of BS-Seq experiments + show a severe bias towards non-methylation as a result of end-repairing + sonicated fragments with unmethylated cytosines (see M-bias plot), it is + recommended that the first couple of bp of Read 2 are removed before + starting downstream analysis. Please see the section on M-bias plots in the + Bismark User Guide for more details. + +--comprehensive Specifying this option will merge all four possible strand-specific + methylation info into context-dependent output files. The default + + contexts are: + - CpG context + - CHG context + - CHH context + +--merge_non_CpG This will produce two output files (in --comprehensive mode) or eight + strand-specific output files (default) for Cs in + - CpG context + - non-CpG context + +--report Prints out a short methylation summary as well as the paramaters used to run + this script. + +--no_header Suppresses the Bismark version header line in all output files for more convenient + batch processing. + +-o/--output DIR Allows specification of a different output directory (absolute or relative + path). If not specified explicitely, the output will be written to the current directory. + +--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified + explicitly if Samtools is in the PATH already. + +--gzip The methylation extractor files (CpG_OT_..., CpG_OB_... etc) will be written out in + a GZIP compressed form to save disk space. This option does not work on bedGraph and + genome-wide cytosine reports as they are 'tiny' anyway. + +--version Displays version information. + +-h/--help Displays this help file and exits. + +--mbias_only The methylation extractor will read the entire file but only output the M-bias table and plots as + well as a report (optional) and then quit. Default: OFF. + + + +bedGraph specific options: +========================== + +--bedGraph After finishing the methylation extraction, the methylation output is written into a + sorted bedGraph file that reports the position of a given cytosine and its methylation + state (in %, see details below). The methylation extractor output is temporarily split up into + temporary files, one per chromosome (written into the current directory or folder + specified with -o/--output); these temp files are then used for sorting and deleted + afterwards. By default, only cytosines in CpG context will be sorted. The option + '--CX_context' may be used to report all cytosines irrespective of sequence context + (this will take MUCH longer!). The default folder for temporary files during the sorting + process is the output directory. The bedGraph conversion step is performed by the external + module 'bismark2bedGraph'; this script needs to reside in the same folder as the + bismark_methylation_extractor itself. + + +--cutoff [threshold] The minimum number of times a methylation state has to be seen for that nucleotide + before its methylation percentage is reported. Default: 1. + +--remove_spaces Replaces whitespaces in the sequence ID field with underscores to allow sorting. + + +--CX/--CX_context The sorted bedGraph output file contains information on every single cytosine that was covered + in the experiment irrespective of its sequence context. This applies to both forward and + reverse strands. Please be aware that this option may generate large temporary and output files + and may take a long time to sort (up to many hours). Default: OFF. + (i.e. Default = CpG context only). + +--buffer_size This allows you to specify the main memory sort buffer when sorting the methylation information. + Either specify a percentage of physical memory by appending % (e.g. --buffer_size 50%) or + a multiple of 1024 bytes, e.g. 'K' multiplies by 1024, 'M' by 1048576 and so on for 'T' etc. + (e.g. --buffer_size 20G). For more information on sort type 'info sort' on a command line. + Defaults to 2G. + +--scaffolds/--gazillion Users working with unfinished genomes sporting tens or even hundreds of thousands of + scaffolds/contigs/chromosomes frequently encountered errors with pre-sorting reads to + individual chromosome files. These errors were caused by the operating system's limit + of the number of filehandle that can be written to at any one time (typically 1024; to + find out this limit on Linux, type: ulimit -a). + To bypass the limitation of open filehandles, the option --scaffolds does not pre-sort + methylation calls into individual chromosome files. Instead, all input files are + temporarily merged into a single file (unless there is only a single file), and this + file will then be sorted by both chromosome AND position using the Unix sort command. + Please be aware that this option might take a looooong time to complete, depending on + the size of the input files, and the memory you allocate to this process (see --buffer_size). + Nevertheless, it seems to be working. + +--ample_memory Using this option will not sort chromosomal positions using the UNIX 'sort' command, but will + instead use two arrays to sort methylated and unmethylated calls. This may result in a faster + sorting process of very large files, but this comes at the cost of a larger memory footprint + (two arrays of the length of the largest human chromosome 1 (~250M bp) consume around 16GB + of RAM). Due to overheads in creating and looping through these arrays it seems that it will + actually be *slower* for small files (few million alignments), and we are currently testing at + which point it is advisable to use this option. Note that --ample_memory is not compatible + with options '--scaffolds/--gazillion' (as it requires pre-sorted files to begin with). + + + +Genome-wide cytosine methylation report specific options: +========================================================= + +--cytosine_report After the conversion to bedGraph has completed, the option '--cytosine_report' produces a + genome-wide methylation report for all cytosines in the genome. By default, the output uses 1-based + chromosome coordinates (zero-based cords are optional) and reports CpG context only (all + cytosine context is optional). The output considers all Cs on both forward and reverse strands and + reports their position, strand, trinucleotide content and methylation state (counts are 0 if not + covered). The cytsoine report conversion step is performed by the external module + 'bedGraph2cytosine'; this script needs to reside in the same folder as the bismark_methylation_extractor + itself. + +--CX/--CX_context The output file contains information on every single cytosine in the genome irrespective of + its context. This applies to both forward and reverse strands. Please be aware that this will + generate output files with > 1.1 billion lines for a mammalian genome such as human or mouse. + Default: OFF (i.e. Default = CpG context only). + +--zero_based Uses zero-based coordinates like used in e.g. bed files instead of 1-based coordinates. Default: OFF. + +--genome_folder Enter the genome folder you wish to use to extract sequences from (full path only). Accepted + formats are FastA files ending with '.fa' or '.fasta'. Specifying a genome folder path is mandatory. + +--split_by_chromosome Writes the output into individual files for each chromosome instead of a single output file. Files + will be named to include the input filename and the chromosome number. + + + +OUTPUT: + +The bismark_methylation_extractor output is in the form: +======================================================== + + +* Methylated cytosines receive a '+' orientation, +* Unmethylated cytosines receive a '-' orientation. + + + +The bedGraph output (optional) looks like this (tab-delimited; 0-based start coords, 1-based end coords): +========================================================================================================= + +track type=bedGraph (header line) + + + + + +The coverage output looks like this (tab-delimited, 1-based genomic coords): +============================================================================ + + + + + +The genome-wide cytosine methylation output file is tab-delimited in the following format: +========================================================================================== + + + + +This script was last modified on 25 November 2013. + +HOW_TO +} diff -r 0f8646f22b8d -r fcadce4d9a06 readme.rst --- a/readme.rst Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -=============== -Bismark Wrapper -=============== - -Bismark_ uses Bowtie or Bowtie2 to map bisulfite converted sequence reads to a reference genome and determine cytosine methylation states. - -Publication: http://www.ncbi.nlm.nih.gov/pubmed/21493656 - -User Guide: http://www.bioinformatics.babraham.ac.uk/projects/bismark/Bismark_User_Guide_v0.7.12.pdf - -.. _bismark: http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - -Preparation -=========== - -Create your reference index with *bismark_genome_preparation* in your normal Galaxy Bowtie2/Botwie index directory. It will create a Bisulfite_Genome folder directly in your Bowtie2/Bowtie index directory. -If you follow that approach you do not need to specify or modify an extra .loc file. -That wrapper will extract the path to the Bisulfite_Genome folder from ./tool-data/bowtie2_indices.loc or ./tool-data/bowtie_indices.loc. - -======= -History -======= - -- v0.7: Initial public release -- v0.7.8: update and add Tool Shed Integration -- v0.7.11.1 change default output to BAM, from now on samtools are required -- v0.7.11.2 added multi-threading to samtools (samtools > 0.1.19 is required) -- v0.7.12 upgrade to bismark 0.7.12 and fix a major slowdown -- v0.7.12.1 define a dependency to samtools 0.1.19 - - -=============================== -Wrapper Licence (MIT/BSD style) -=============================== - -Permission to use, copy, modify, and distribute this software and its -documentation with or without modifications and for any purpose and -without fee is hereby granted, provided that any copyright notices -appear in all copies and that both those copyright notices and this -permission notice appear in supporting documentation, and that the -names of the contributors or copyright holders not be used in -advertising or publicity pertaining to distribution of the software -without specific prior permission. - -THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL -WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE -CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT -OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -OR PERFORMANCE OF THIS SOFTWARE. - diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_output_result1.bam diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_output_result2.bam diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_report_result1.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_report_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_stdout_result1.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_stdout_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_suppressed_reads_l_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_suppressed_reads_r_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_unmapped_reads_l_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bowtie1_singles_unmapped_reads_r_result2.txt diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bwa-mem-fastq1.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/bwa-mem-fastq1.fq Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,396 @@ +@M01368:8:000000000-A3GHV:1:1101:6911:8255/1 +ATCTGGTTCCTACTTCAGGGCCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATT ++ +BCCCCFFFFFFFGGGGGGGGGGGHHHHGHGHHHHHHHHHGGGGGGHHHHGHHHHHHHHHHGHHHHHHGGHGGHHHGHHHHFHHGHHHHHHHHHGHEHEFFGHHEGGCEFGGFHHHBGHHGHHHHGHFHHHGHGHGHGGCDFDDACGGGGGGGAAFFFFFFFFFBAFFFFFB;FFFFFFADDFFFFFFFFFFEFFFFFFFFFFBFFFFFFFFFFFFFFEFFFFFFFFBFEFFFFEFE;DFFFDFBFF/9BFB +@M01368:8:000000000-A3GHV:1:1101:14518:9998/1 +GTTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGAT ++ +AAAAAFFFFFFFGGGGGGGGGGHGGHHHHGHHHHHHHGCGHHHHHHHHHHHHHHHGGGGGHHHHHHHHHGHHGFHFE5BGEEHFGGGHHHHHHHHFBHHGGGGFHGHHFGHHHHGHHHHHHGEGGGGFHFHGEGHHGGCDGDGHGGGDGGHGGCGGGHGHHH/ACDG?.1FGCDCCGCA.CC@CDCHFHGFFGGGEBFGAB//CEFBFGG.:;D;;A0AFFFFFB..:@ABFF//;BFFFFFBF/9D:A// +@M01368:8:000000000-A3GHV:1:1101:18422:19051/1 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACG ++ +CCCCCFDDDDDFGGGGGGGGGGHHHHHHHHHHHHHHHHGHHHHHHFHHHHGGGGHHHHHHHHHGHHHHHHHHHHHHGGHGGHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHGCGGGHHHHHHHHHHHHHHHHHHHHHHGFDHGFHCFGGGGFGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;FFFFFFFFFFFFFFFFFFFFFFFFFFFFEFBFFFFFFFFFF:FFF. +@M01368:8:000000000-A3GHV:1:1101:25545:21098/1 +ATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATAAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGGCTTATTTAAGGGGAACGGGTGGGG ++ +33AA?DFD5BDFGGGFEBDGEGHEGHGEGHCEGGHHCHGHHFFHHGFGAGE53FF2FAFFGDE5FFFE5GFBFGAEE1GHHHGHHHEHE3FGHF@GEGEGGHHGG3FAGFFDE?EEE3GFEGFGFGGCG?GHHHFHGGGC@DHFFHD/AA@AAAAD2ADDFFGGGGGFGGHA?EAEFBEAGHFABFGG5FDF5DB1EEGAFDFB53FF5FH@G5FFEHGHEFHFFHBE333GF43GCGGGGE@0?BFGGB0B?FHGFDGGHHHBFFDEGGHGFFFDFE@<1>@FFFGHHHHFHEFGDABFFGG/@DCE---;--..;.../9 +@M01368:8:000000000-A3GHV:1:1103:25519:6301/1 +AAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGCATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATTTCGT ++ +3A3AAFFFFBFFGGG2CF?4FGHGHHCHGGHHBHGFFAGGHHFFHHCGHFEGCECEGHFFFHGHHGFGHFGDAFDGFFEEFEGGBECC/;>=EF?AEFFFF;9;=DAFFFFBBFFFBFFF//;BFB.:FBFFBB/BFFFF/BFFE?EFAFEF9 +@M01368:8:000000000-A3GHV:1:1104:11928:24197/1 +ACGTAGGTGCGATAAATAATGGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATGTCGGATA ++ +CDDCCFCFFFBCGGGGGGGGGGHHHGHHHGGGGGHHHHHHHHHHHHHHHHHHGGGGGHHHHGGGHHHGGGGGGHHHGGGGHHGGGGHHHHHHGGGGGGHHHHGGGGGGGHGGGGGGHHHHHHHHHGHHHHGHHGHHHHGGHGEEGGGGFHHFHHHGHGEHHHHHHHHHHHHHHHHHGGGGGGFGGGGGGGGFFGGGFGFGAEFEFGFEAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF@D9 +@M01368:8:000000000-A3GHV:1:1104:12288:20635/1 +CACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGCCGCAGCATCTGTCTTTGATTCCTGCCT ++ +ABA?AFBAB2>AFGGGGFFGD5GGHGFFHGHEGGGGHHHHFFHH5FHGHHHFFHEFHHHHHHGGHHHHHGFHGGHGGGGFHGGGFBHHHFGEGHHHHGHGGGGECEHHFHHGHHFFHFGGGHHHEGHDGHHGHGHFHHHECGCBCHHGFHHFHD1GFHEFHEF?GDCFFEFE0A@;@>BD/9BFBBBBFDFFFF99D=?>DDBFF-AFFEBB9BBEFEFAFFF//-@---;;FBBFBFB///;99999BA9 +@M01368:8:000000000-A3GHV:1:1104:13832:4398/1 +CACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTGACAAAAAAAAACACACAACAACCAACACCCAGTAGACTAGTGCGTC ++ +AAABABCAFFFFGGGGGGGGGGHFHHHHHHHHHGFFHHGGHGGGGGHHHHHHHHHHFHHHHGHHHHHGGHGHGGHGGGGGGGHGFFGGGGDGGHHHHGGGGFGGGGGCFGGEGCDGGGGGGGGGGGGEGF?;BFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFF/FFFFFFC.;BFEAFFFFF;@FFBFFFFEF?/FFFFE----:...:...9....9.9...:.9//9////;/9-.- +@M01368:8:000000000-A3GHV:1:1104:23670:4161/1 +ATTCAATTGTTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCC ++ +AAAABFFFFFFFGGGGGGGCGBBGGHHHHHHHHHGHHHHHHHHHHFGGHHHHHHGGFHHHHHHHGGGGGFHHHGHHHHHGHHHHGHHGGGHGGGGHHHHHGHHHHHHEGGGFHHHGFFDGEGHGHFHGHHGEGGGHHBGHGGHGHGGGGG/@FCDFDGFHGGGDFFHFGFFGGGGGHHHHGGGGGGGEGCDG?GGBFFFEE00BFFGABBBAFGGGEGEA.9FFF/./;BFBF./AAF/BFFBBF/F/BFB +@M01368:8:000000000-A3GHV:1:1104:24927:22253/1 +AGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTAT ++ +AAA?AFFF3DFFFGGBGFF3EAEEEHFHHFHHHGHHHHDGHHGAEFGHHHGFCHHGHHHBGHHHAEGGHGGFHFBFHHHHHHHHHHFHH1EEHEEHH1FFDEEFCGHBGHHFHEHHHHHH1FCGHHHHEEFGEEGFFCHHH11>GHHF/FG2DGGCGAGGHGGGG@@@BEFFF?@@@FFEFFFFFF?@@FB@BB@FFFE=;==---A9EFBBB/;E;A9B/B/FFFEF/B///;/;9/A/;/9BFFFBFBB +@M01368:8:000000000-A3GHV:1:1104:4705:7727/1 +CTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGG ++ +BBBBBFFBFFFFGGGGGGGGGGHHHGHHHGHHHHH6GHHHHFFGGGHHHHHHHHHHHHHHHGGFGGGEGGGHHHHHHHHHHGHE3FGGGGHHHHHFHHHHHHHHHHHHGGGGGHHFHFCGFHHGGGGGGGHHGGGGHHGGGGGGHHHHGGGADCHHHHGDGGGGGGGGC@DGFHEHHHHHHGHHHHGFGDEGFGFFGGGGCFGBEGFGGGFG;C.BFFFFFFFBFBFFFF/F?.AFFF/BFFEFFFFFEFF +@M01368:8:000000000-A3GHV:1:1104:7756:4776/1 +GGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGA ++ +CCCCCCFFFFFFGGGGGGGEEFHHHHGHGHHHHHHGGHHHGGGGGGHHGHHHHHGHHGHHHGGFHHHHHHHHGHHHFHHGHHGFGGGGHHHHHHHHGHGHFFHFHHHHHHGGHGFGFHHHGHG?FGHHHHHHHHHHHHHHGHGHHGHGGCDGFFHHHHGHHHHHGHHHGFHGHFHGHGHGH/@CABFFFFFFFACAFF?BBFFBFD-@-.D>FFAF.F.:9---;AADFFFFFBEFFF:9FFFFBBFFFF/ +@M01368:8:000000000-A3GHV:1:1105:11310:21761/1 +ATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATGTCGGATACAGTTCACTTT ++ +CCCCBFFFFFFFGGGGGGGGGGGHHHHHHHHHHHHGHHHHHGGGGGHHHHHGGHHHGGGGGGHHHGGGGHHGGGGHHHHHHGGGGGGHHHHGGGGGGGHGGGGGGHHHHGHHHGHHHGHGHHGHHHHGGHGGG=FFGEHHHHHGHFGGHHGHHHHHFHHHHHHHGGHHHHHHHGHEHHGHHHHHGGGGF9AGE?DEGFFGGGGGGGGGGGGGGFFFF/9FFBFFFFFFFEFFFBFFFFFFFFFFFFEFFF0 +@M01368:8:000000000-A3GHV:1:1105:11578:14676/1 +ATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATGTC ++ +CDDDDFFFBFFCGGGGGGGGGGHHHHHHHHHHHHGGGGGHHHHHHHHHHHHHHHHHHGGGGGHHHHFGGHHHGGGGGGHHHGGGGHHGGGGHHHHHHGGGGGGHHHHGGGGGGGHGGGGGGHHHHHHHHHHHHHHHHHGHHHHGGGGGGHGGHGHHHHHHH;CHHHHHHHHHHHHHHHGHGGGGFGGGFGGGGGGGGGGGGFGGGEGGFFFFFFFFFFFFFFFFFFFFFFFFF/FFFFFFFFFFFFFFFF0 +@M01368:8:000000000-A3GHV:1:1105:13575:19255/1 +TTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGTTCCCGTGAGTGGTTAATAGGGTGATAGGCCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATG ++ +AA1A1DFFFFFBGG3A111FFAEGDDBGFDHF00F01BBA00BFGEHFBFFA1FABB0FAE?CFHF1BGFHFB0011F121E//>>1F10>B?EFHGCC/?/BFG/EEGFFEE//BCFDE0?E@@B/?11AAFFFFFEAGGGFGGAA6FHDGGCGFHFFHFHFHHHHHGDDDFFFHGFFHGBEGEGHHHFFFHGEEAEGHHHHHFHHHFHGHFEDGGEGBEAGEFHHFFHHHHHHGGHHHGHHGBGHHFHHFGHHHHG?CG?AFHH1FHHHGHFHHHF?DGHFFHCGAGEFHGCBBBB?BFFF?B-B.9/BFFFFFB=-AD?..@FFFF@@-@@?AFD.FFFBBEEBD9/:FFFFFBB/:FF9BF/BF.BBFF/: +@M01368:8:000000000-A3GHV:1:1105:21099:2895/1 +TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTT ++ +CBCBBFFFFFFFGGGGGGGGGGGGGFHHHHFHGHHHHHHFHHHHFGFHHHHHHGHHHHHHGHHEHGGGGHHHHHHHHHHHHHHHHHHHHHHGGHGGHHHHHHHHFHHHHGHGHHHGGHHHHHGGHFFGHGCGGGHGHHGHHHFGHHFHH/FGGGHFGFEGGCFCCD;B@DFFFFFFFFFFFFFFFFFFFFFFFFF;F.BEFADFFFF.AEFFFFFFE;D.;:B00099:09BFBFFBFF.0:000BFFBF0 +@M01368:8:000000000-A3GHV:1:1105:26469:19245/1 +TTATGATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCAAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATG ++ +BAAAAFFFBFFFGGGGGGFFGGHHHHHHHHHHHHGGHHGHHHGGHHHHHHHHHHHDEGGHGHHHHHGEDHFHGFHGGHHGGHGGEHHHGHHGHHGHHHGGEGGGHGHGHHHHHHGHHHHGHGGGGGHFHGGGGHHGGGGCDGHHHGD>A@FFFFFFFGGGFEECFGGFFB5GBHH5DFH5GBEEAEFHFHHF3F3GHFHHHEF3GAEHBGFHFFEFFHHHHFFB31FGGEGFHGEHHHHHHHHHHHHHHHHHEEEEGHHHFFF3GF3FGF44BBGEHHHHGG2B/BGGHE;B=.9.;.BBFFF..;A;/B/;B//;;//BBF/ +@M01368:8:000000000-A3GHV:1:1106:22927:3863/1 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGACTTCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAA ++ +A>ABAFA?D@AFGGGGFGGGFGFHHHFFH4BCBGHHHHGFFF3G5BF3322EEAFHHHHHHAB5GHFHEHFHHH3BEEHFGF5@13FDF3GDGFHHFGHHHGHHHGFGGHHFCG//?/EGHHHG3F4BFFGHHHEDGFHHB/B/?FBF<@-9?BDEFFF?B--?DFFFB9FA>B;A.BFDB?AB.--@;-.FFAEFFFF9AE;DDFFFFB/9BFFFFFFFB:BF/BB/BFF//FFF-=--.99=--.9/// +@M01368:8:000000000-A3GHV:1:1106:23625:4236/1 +TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTA ++ +@?AAAFFBFBFFFGGFGFGGGEFCGGHHHHFGHGHFHGHHHHGGFGGFGHHGHGHHHHGHGFFHHGGGGGHHHHHHHHHBFHHEFHHHHHHGF?FAGHGFHHHGDGFHHGGGHGHFHHHGHHHGHHFHHGFFGFHHHGHHHHFFHHHHH/FGHHHHGHGHGDFFFE?FFF/--;;9/:BFFFBA///9----;B.//////;...---9--- +@M01368:8:000000000-A3GHV:1:1106:6455:17124/1 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACG ++ +BBBAAFABDBBFGGGGGGGGGGHHHGHHGHHHHHHHHHGHHGHHGHHHGHGGGEGHHHHHHHHHHHHHGHHHGHHHGGHGGHHHHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHGGCFEHHHGFHHHHGHHFFH?GGHHHHGHGHGHHCFGGGGGGGFFFFFFFFFFFFFFFFFFFFADFA?.FEFAFFFEFFFFFFFFFFFFDABFBBFFFFFFFFEFFBFFFFFFFFFFF/FFBFFFFFFADE>BFFF9 +@M01368:8:000000000-A3GHV:1:1107:14451:26748/1 +GTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGA ++ +BABBBFFDABBBGGGGGGGGGGHGHHHHGAGHGGGGGHHHHHHHHGHHHHHGGGGGGFFHHHHHHHHHFHHHHHGGGGCHFHH3FEHHGCGGGDGHHHGFGGHHGGGGHHHHHHG/DGGGHHHHGGGGGCCHGG?CGG1GHHGHHHHBGHHHFHHGHHHHGGADAD00ECEGHEFHHFFAEFFFFFGGFBFBFFFGFFG.;B/99;BCFGGGGEFFFFFFEDA./:ADEB?FBFFFF;FFEBFBFFB./9. +@M01368:8:000000000-A3GHV:1:1107:18714:24788/1 +GTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGA ++ +BBBCCFFFDEDEGGGGGGGGGGHHHHHHGGHHGGHGGHHHHHHHHHHHHHHGGGGGHHHHHHHHHHHHHHHHHHGGGGGHHHHCFGHHHGGGGGGHHHGGGGHHGGGGHHHHHHGGGGGGHHHHGGGGGGGHGGGGGGHHHHHGHHHHHHHHHHHGHHHHGGEGEGCCEGGHHHHHHG:CAFFGGGGGGBFEFGGGGHGAFFGGFGGGGGGGEFFFFFFFFFEF/EDDFFFFFFFFFBFFFFFBBFFBBF/ +@M01368:8:000000000-A3GHV:1:1107:21355:8333/1 +ATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATG ++ +ABCBBFFFFFAFGGGGGGGGGGHHHHHHHHFHHHHHGGGGGHHHHHHFHHHGHHHHHHHGGGGGHHHHFFGHGHGGGGGGGHGGEGGGHGGGGHHHHHHGGGGGGHHHHGGGGGGGHGGGGGGHGHHHFHHHHHHHHGHFFHHHHGGHGEGHHGHGFHHFHGHCEFHEGHGHHHGHHHGGGGGGCFFFFBFGGFGGGGFBG?FDGGGEGFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFF.AA/9 +@M01368:8:000000000-A3GHV:1:1107:22864:15712/1 +AATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATCTCGTATG ++ +BBBBBFBBFFFFGGGGCEGGGGHGHHDGFHHHHHHHHHHHHHCHHGHFGBGDFGE?GHHGHFBHHFHHHHHHHHHGHFGFGHGHHGGC@DCFGHHGGGGGGGHHHHHHGGGGGFGGGGFFGGGGFADGFEFFFFFFFFFFF9;FFFF/BFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFEAECDFFFFFFFFFBFFFFFFFFFFFFFFEFBFBFFF/FFFDF-;-9;E@>-9DFFF/FFBF;-.:;.9A// +@M01368:8:000000000-A3GHV:1:1107:24817:14233/1 +CTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACAT ++ +3AABABFFBFFFGCFGGGGGGGHHHFGCFGGGFFGHHCGHHHGHGHGHBFFHHHEDGGFGFGHHFFFHGGEGHFHHGHFHGFFHHHFGHHHFHACECEHHHBGFHGFHHHFHHF4GFHFHFHEHGGE/EG:A-CFGGFGADGGGGFGGGGDAAAADFA.-AFFECFDAAFFFFEFFFFBFF;D;BFFFB/;BFFFFFFFFFFFFFFBFFBF/;//;BFFFF?FFFFFFEDAFFFFFBF//:BFBACF--9/ +@M01368:8:000000000-A3GHV:1:1107:24830:14440/1 +AACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACC ++ +ABBBA@FFFFBBGGGGGGGGGGHGHHGHHGHHHGGHHHHHGHGHHGDHHHHHHAGGGGFHHGHHHHHHGHHHHHHHHHHHEGAFGFHFHHHHGEGHHHHHHHHHHHHHHHHHGGHHHGEFEGGHHGHHHHHGGHGFHH/?GHHHHGFGHGFHCGGBBDAFFFEFFFFFFFFFFFFFF=CDDADDFFFFEF=BFFFEFFFBFFFFEEBF./:FFFFFFFFFFFFFFFFFFFEFFFFFFFFFFF/BFBEFFC- +@M01368:8:000000000-A3GHV:1:1107:26478:23602/1 +GCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATCTGTCTCTTATACACCTCTC ++ +1AAAAF1B13BBBEGGGFGGGGGGHHDB1BGFHHHFGFGGG0F00BGBGFGGF1GBGB1BGCGAEE/FGFHGHFHHHHHBFF1B0FGGF//F//FGFGFFF/1@B1DFHHHGGEFEDHFDFAFF0B//1FDD1GFFHHDF?1BD>GF/GCAC0C@?@-;99BBB->@--;-B//99B--9-BF=A--A-:A----9AA9AFBFBBB?E@BBFFFFFFFBFFFFFBFFFBBFFF//B/////-;9 +@M01368:8:000000000-A3GHV:1:1108:11059:23181/1 +GTTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGAT ++ +CCCCCFFFFFFFGGGGGGGGGGGHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHGHHGGGGHIIIHHHHHHHGHHHHGHHHGGHGGHHHHHHHHHHHHHHGGGGGHHGHHHHHHHHHHHHHHHGGGGGHHHHHFGHHHGGGGGGHHHGGGGGHGGGGHHHHHHGGGGGGHGGHEFGGGGGHGGG-ECHHHGGGGGGGGGFGFFGGGGFGGGCF?EFGG0;BFFFFB?9AAFFFFFFBF;BBFFBFFFFE?F/ +@M01368:8:000000000-A3GHV:1:1108:12233:13498/1 +TAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCCTGTCTCTTATACAC ++ +BCCBAFFBCCCCGGGGGGGGGGHHHHGGGGGHHHHHHGHGHBGFHHHHHGGGGGHHGGBFGHHGGGGGGGHHHGGGGHHGGGFHHHHHHGGGGGGHGHHGGGGGGGGGGGGGGHHHHFHHHGHGHHHHHHGHGHHGGFGGGDFFGGHHHGHGH/FHHHFHGHGHHGHHHGHHBAAAA3BDFFFFDGBFFEEGGGGHHGHFHHCFHGHGHHHGF3BFGFHGFEFHFHGH3EGFC1EEGBFGBHFGGFFHHHGHFHGHHGHGGHCEHEBCFF3ED3@GH4FDEG12FFHFFHG/G3FGA/@EFEFCG33FFGBBGHFHH/?BFBGFGFAGC2C/@-;@-;BFFEFADF?DFFFFFF?;AFFA;AD;..FEFFA--;9;D.9BFFBE?;.;///://///99//BB/9;F./B//BBFB9////99/ +@M01368:8:000000000-A3GHV:1:1109:25237:11722/1 +CCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATATCGTATGCCGT ++ +AABBBFFFFFBBF5BGGFGABGHGGAEFGFHHHHFHCHFHHBFGGGEHBGGHFFFFGFAEF5FHH?EGGF5AEG3?EBFHD?C/EFHFFEGEE?CFGGGHHFHB3EEFG?EEGHCFGGGGGGGECA?FFFDFBFGCG4EGFG??GACFGGHFHCFHCHBECDEFCEGHHHHHFFGGF3AFFHHFGEGCGHHGEHGFGHHEGG?EFHHH?EGGHHGGGGHHHHHHEGGGGGHFHHGFDDGGACFCC@?/FDDGHHHGHEFFHHH0FDHHFFEDCD.CFHHGDFGGGGGEFGC?EGHHHB0CGHHGCGHHHHBEHGHH.::ACA;FECFGGGGGFFFEFGGGGGGGFFBBBFGBFFFEFFFFBFFFFFFBBFBFFB9..AF?FFFFFFF9 +@M01368:8:000000000-A3GHV:1:1110:14751:26540/1 +TAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACTGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGTTGTGCACGCGATAGCATTGCGCGACGCTGGCGCCGGCGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTG ++ +AAA?1CFFFFADGEEGFGB1112DGDGHGFGHHCG/ECEBGGHDFGHHFHGHEB1BCGFFE2EGHHGHFFFHHBFFHEFCGGHHFHHHHH111BFFHGHEHBEHGGECEFHGFHFGGDBBGHHBHHHH0GH22FGHGFFFH1BFCG0F@FGG11@GDHFG1D/1FGFHH/FCG.>0><-<@CCFFG.CC-@C?C9FFF0C---;------;9----9-9@-AAFF9B9---9-/;B9BF/9BB//B/;/// +@M01368:8:000000000-A3GHV:1:1110:15519:10008/1 +CTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACC ++ +BBBBBFFFFFFFGGGGGGCBGGE?GGGHHHHHGHHBEGGHHGGGGGHHHHHHHHHHHGGHHHHGFGGGGHGHGCFGHHHGGGCGGHGHGGGGGHFGGGHHHHHHGCEFGGHHHHGGGGGGGHGG/CEGHHHHGFHHHGFHHGFFHGHGHFGGEGFDDGGBCGHHGFHH/CHHHHGHGHF0CFGGGGGGGFFFFFBFFFFFFGFFGGF?DGEDEEFEFFFF?FFFFFFFFFFFFFFFBFFF.BFFFBFBBA; +@M01368:8:000000000-A3GHV:1:1110:16559:7507/1 +CATTCAATTGTTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAACTGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATC ++ +ABBBAFFFFFFFFGGGGGGGFGGGGAGHHGCHHGHHHHHHHHHHHGHHGHHHHHHHHHHHHHHHHFGGGHHHHHHHHBDHHHHHGGHFEEGHGGHHHHHHHGHHHEHFGGGGGGHHHF33FGHHG3GDDGH//EEGHHFFBF?GHHFCC@DGFCGGB?DGHAFGGHHGFH0E??<@DGHGGCFGGF?DGGGFFGGGGGGGG.C/0BBF?FBBEGG0FFF.AA9;B9/E////99;;:.//:FFFFBBF//9 +@M01368:8:000000000-A3GHV:1:1110:16700:6843/1 +AAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCTGTCTCTTATACACATCTCCGAGCC ++ +BBBBBFFFFFFFGGGGGGGGGGHHHHGHHHHHHHHHHHHGGGGGHHHHHGGHGHGGGGGGHHHGFFGHHGGGGHHHHHHCGG@FGHFHFEGGGGGGHGGGGDGHHHHHHHHHHHHHHGHHGHGHHGGGGGCGHEHGHHHFHHHFF.DGHHHHHHGHHHHHHGFGHGCGHGHHHHHHHHGHHHCD?:A::FCDGEGFGFFBFGBFGGGF00CBFFFF/BFBFFFFAE.FBBFBFFFFB/BFFBFFF/ADCAD +@M01368:8:000000000-A3GHV:1:1110:18014:7856/1 +ATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGATGCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTGCAAAAAAAAAATAG ++ +AABBBFFFFFFFCGGGCGGGGGGHHHHHGFHHHHHHGHHHGGGGGFHHGHFGGHHHGGFGGGHHGGGGGHHGGGGHGGHHHGGGGGGHHGHGDGGGGGGGGB/A99FE//BBFFF.9/FF?DA=@9./9 +@M01368:8:000000000-A3GHV:1:1110:23303:11846/1 +ACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGTGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTC ++ +3ABB?FAB?DAAGGGFBGGBB5AFFFGFHHGGGEGHHHFHHF3EGFHHHHEFHHGGHHHGH3EFHHHHHBDGHHE?ECHGHGHHHHHHFHHHGHHHHHHHGGHGGFHHHFHHGFFHHFEGFHHHHHHEHHFDAGHHHH/BDDFGGH1FDBGHHHHHHHCGHHHFH?0CGCF>CC=@--AFBFFB?=BFDFFF//FDB;-9A-AD-.A.A=?-9-:B/AEB/BFF.A;DDFFFBBFFFF/BBFFFFBFF9B; +@M01368:8:000000000-A3GHV:1:1111:13195:14562/1 +ATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATGTCGGATACCTGTCTCTTATACACATCTCCGAGCCCACGAGA ++ +ABAAAFFFFFFFGGGGGGGGGGGHHGG4FDFGFGGCGGGHGHG?EGHEGFFDHFHHHHGGFE@EHHGEEEGFCCEFFGFG/EHHHHHBFBGGHFHFHEHGHEFGGGED/EFDEFFEHHBGHHCFFFDFGGHHGFFFHHHHGFG>0<0DGGFHBGHFBDFGHGFFGGAGHHGCGBHFFHHHFFFBFGGGGF/:FFFCB9F9FF0FEFE?FF0BED-D.//BF9//BBFF/BFFFEFFFFED-DCFF?.:BFA +@M01368:8:000000000-A3GHV:1:1111:14287:20817/1 +GCCAACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTCATCGCACCTACGTTCACTATTACAGGCGAACATACTTATTAAAGTGTGTTAATTAAT ++ +>>A3>AAAACFFFGFGFGGGGGHGHHHHHHHGGGGGHGHFFCGHHGBGHFHHHHAGHHHHEHHAGFFEHEAE?GHHHHEGFFFDGHGHBFFHFHHHGHGHGHBECG@@@EHGGEGCGGGGFFGGGG?BGGGGGG.-C.FEDCFFFEFFFAF/99BEFA.:9BFB/9FFFEFFFFFFEFFF/FBBFFFFFFFFFFFB//;.@@//;DFFFF//BBFFFBF;A;DF=>B//;9B//B/;/;9//9//;B/:B9 +@M01368:8:000000000-A3GHV:1:1111:23624:14516/1 +GTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGGGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAG ++ +3>3AAFFFFFAFGGGGGGGGBGHHHHHHGGHGGHEHGHADFHHHHEGGGHBGHFFHHEHHGHFHGG5FBEEGEGHHHGHHHEFDGHHFFGFGGHHGHHHGHBHGHFHFHHGGGGFCFBGFGE?GFBECEGDGHEHGDGCGHGC/DF@GHGFFCGGGGFFFFDD?@C@GFEC?-ADHFFHBGHFBHFG0CC//.;B0/CA.;AA.;@.AB/9BB/A/.AFFFE/BFFF///9B/FBDE9/;99/B/BFFEF/ +@M01368:8:000000000-A3GHV:1:1111:7437:24508/1 +ACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTGTCTCTTATACACATCTCCGAGCCCACGAG ++ +CBCCDCFFFFFFGGGGGGGGGGGGGGHHHHHHHHHHHHHHHHGGGFFEHGGHHHHHHHHHHHGHHGGGGGHHHHHHFHHHHGHHHHGHGGGGGHHHHHGGHHHGGGGGGHHHGGGGHHGGGGHHHHHHGGGGGGHGHHGGGGGGGHGGGGGCGHHHHGHHGHHHHHHHEDHHGHGGGGGD;BEGGGFFGGGGC:CFGGGGGGGGGFGGGFFFGFFFFF;FFF;BFFFFFFFFFFFB:FFFFD=CAEEEF>- +@M01368:8:000000000-A3GHV:1:1111:8665:25542/1 +TCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTGTCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCG ++ +AA3AAFFFFFFFGGGFGFFGGGGHGH4AGGGFHHCGEE2FGHHHHHHHHHHGFHGF53FHEGGFAF5CFHHGGFGHFHHGGHHHCHHHHHHHFHHGH?EEEG1FFFEHHFHFGGHHGG?F4FBGHGHCHEF0//-9--;0CFGG?@BGDGFFFFEFBBBDA?FFBFFFF=B;-9.AA9EFFFF/;9BD.//;9/9/B:/:BFBFFBFFF/B:BFF//;/B/FFFBFF.A-BFFFBA/F//BFFFBF/;/A- +@M01368:8:000000000-A3GHV:1:1111:9655:13916/1 +GGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGCGGTGTCCACGCAATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGA ++ +1>>1AAFFFFFBGGFE1EEEEGGHGHHH2FGHHHFGFGHGCFGGGEFHGGEGGHHHHGHEFBFEGHHHGHHBEHHHHGHGFHHHGGGGHHEGGGGHHHGFHGHHHH1FHHG?G/AGF1GBFGEFFHF2FDD0FG--99@=---AAABA-BF/B?A@F-9BBFF//BB/9/ +@M01368:8:000000000-A3GHV:1:1112:10272:7828/1 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCTGTCTCTTATACACATCTCCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTT ++ +AAABABBBBBBFGGGGGGGGGGHGHHGHGHHHHHHHHHGHHHGGHDGHHHGGGGHHHHHHHHHHHHHHHFHGHHHHGGHGGHHHFHHHHHCGHFFHHHHHHHHHHHHHGHGEFGAEEEEHHGHHGHGHHHHHHHGHHHHHHGHGHFFBC/BA?BGGGFFFFFFFFFFFFFFFC-@DDFFFFFFFEFAFFFFFFBFBFFFFFFFFFFFFFFFFBFF->@FFFFFFFFFFBFBFAD--.99;A?////:.9A. +@M01368:8:000000000-A3GHV:1:1112:9832:16531/1 +CTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGGGCTGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCTTGAAGTAGGAACC ++ +AABABFFFFFFFF4GGGGEBGEFGHGGHGHGHHHGGHFBGFGGGGGHHEFHGEHFHFHEHHHHHGCGGGHFHHCFGHE@1EEEFFGB?EEEEHBGGGCHHGHHBEE@>EAHHFAEBDGGGCEFF?CEGFFHHHHHFHHHFHGHCGCAHHF<..C.D=0CD.GFHHHG.CAGHC:CHB0GGGFFB0:CFA.A-;9.;000CBBFFBFGA.9AA;A;EDDE.ABBFFFF9;BEFFF/BF//9/B//9:B//.. +@M01368:8:000000000-A3GHV:1:1112:9832:6701/1 +CGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGA ++ +AABBBBFFFFFFGGGFGGGGGGHHHGGHGGHHHHHHHHHHHGHHGGGGGHHHHHHHHHHHHHHHHHHGGCGDHHHHFGGHHHGGGGGFGGHGGGGHHGGGGHHHHHHGGGGGGHHGGGGDGGFGHGGFCDCHHHHGHHHHGHHFAGFGGEHFGEG.C1>11F1D1DDFFFFGGG1FH3GHGHGHGHFHEC?EC0B1FBGFB1GHHH21AF2HH2FEGEHAE10AAE/>/FFHE?/E/?E1@EFFH?E@/EGGG//B11B//<@@C0F/<0AAAAFFFBFFFGGGGGGGGGGHHFHGHHHHGHHHHHHHHHHHGGGGHHHHFHHHHFHGHHHHGAFFEFHEGHHHHHHHHGHEHHGGFGGGHHHHHHFHHHHHGGHHHHGGGGGHFHHFF?HHGGFECEFFGHFFGFHGECDGHGBGFHGDF@@?CGFHCEGGGFD.CCC?EGHBHHHFHHFBCFFGEB/CEGGGGDAA.90C9CEBFGGBBF/9.9FBFFFBBFF//99FFFFEABF//99FFEFFFBFF +@M01368:8:000000000-A3GHV:1:1113:5741:16959/1 +TAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGA ++ +ABBBBFFFFFFFGGGFGGGGGGHHHGHHGGHBGHGAGFFCAFGHGFFGHHGFHHHHHGGGGGHGHHHHHHHHE3BFFHHHGG0EDF@GHFFGGGHGGGGGGGGGGGGGHHGGEEFHGFHHDDG@DGGGHHGDGGGGGHGG?CF?HHGHHHGHGHHHFFHGGGHHHHGGCD.;0A33>FFFFFF@FFFGGGGFGGD4EGGGGGHHGFFGHFGGHHHFEFHECFHHHEHGAGBCCGCCEGGHGHEGFBFHFHHHHGGGHFHGHEGGGFEGEGG??@DBGHGGC-@FFGHFHFHB-??DA-DD@9:BEBG@B?E?A.ABBFBFA??F.FF/BFFB/B9B/9;BF9FF9FFFFFFFFFFFFFF?BB?;9EE;-;DDB9//;B-B@;ABFFEFFFF/-.9A.;//9/BF./B/..9.9:...////// +@M01368:8:000000000-A3GHV:1:1114:14540:5315/1 +CTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGGGGGCTATTTAGGTTTTT ++ +AABCCFFFFFFFGGGGGGGGGGHHHHHHHFHHHHGHHGHHGGGHGGHHHHHHHGHHHHHHGGGGGHHFHHHFGHHGGFHHHHHGGGGGHHHGHGGHHHGGGGGGHGHGGGGHHGGGGHHHHHEGDDFGFFFHHGGGGGCDAFCFGFDHHHHGGHGHHHHHHBCGEHHHHGGHG.ACGEHGG0CBFFF:A;BB0;09CGF00CFFFE0AA?//CFFFFFFFFFFFFFFFBEF;A.-=A--:BBFB90;;FE. +@M01368:8:000000000-A3GHV:1:1114:15066:16302/1 +TAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAACGGTTGTTAATTAATTATTGCTTGTAGGACA ++ +BBBBAFFBDFBFBGGGGGFGGGBGFFFHGFHHGFFFHGHHHGHHHHFFHHHGHGC?AEFFHEFBFFFGHHHHH1FDGFFHGHGHFEGCGC-<>ABFFFFFFGGCGC1BBF1GHHHBHHHHGFHGH1A100AA/GGEHGFBAA1FFD2FHHHHECG2F1BB/E0FC/A>EE/FGGFGEF1EGBBEHHCGGGHBGEHFHE0B?E--;C?CCGGG@A@GBFBBBB09?@??A-AB--ABE@=-=-9AE9A;FFFFFE=9-//;//;9FF/////;;///9B///;/B////9FFBB;--A@-;/9;9-:-/;;FFFE9BF?@;-9-99/B9F/://///99/; +@M01368:8:000000000-A3GHV:1:1114:2404:13066/1 +TCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCTGTCTATTATACACATC ++ +CCCCCFFFFCFFGGGGGGGGGGHHHHHGHHHHHHHHHFFHHHHHGGGGHHHHHHHHFHHHHHHFGGHHGGHGGHHHHHHGHHFHHHHGGGGGGHHHHHHGHHHHHHHHHHGGGGGGGHH?FGHHHGGGGGGHHGGFGGHHGGGGHHHHHFGGGGFGHGHHGGGGGGGHGGGEGGHHGHHHHHHHHHGFBFFDA0FGGGFFGG0:EFGGGGGGGG;AEBF0B0BFFBFFFFFFFFFFFFFFFFFFFFFEFF0 diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bwa-mem-fastq2.fq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/bwa-mem-fastq2.fq Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,400 @@ +@M01368:8:000000000-A3GHV:1:1101:6911:8255/2 +TCGCCTTACCGCTACTCACCCACGGCGGCCATCAGCCGATACTAAGTTTGGGGTATGGTGGGGGGGATAATGAATTAGGTTGTGGGGGAGGGTTTGTGGTTGAGAGAAACACAAAAAACAATCTTATATATGGGTAGTCGTTTTGTATTGGTTTTTTGTTTTGTTTGTGTTTTGAGTGTCGGTTTAGTTCGGTGTACTAGGGGGGGTGGATGGGGTCGGCTGGTGAGGGGGTCTTAGTGTATTGAGTGTGG ++ +1>11111@11111A111A100000000////011110//>>/12@1@22B/////1@>21/>>/-----9/;////9////--;-;-;-----;--------9/-/-///9-;-------9//////9/////-//-/9-;-;9--/////99-;--9-:-;----/---/-----////---9-/////--;A-//////---------9/-----;-----/-/-----;--;//////////9;///- +@M01368:8:000000000-A3GHV:1:1101:14518:9998/2 +CATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAA ++ +CCCCCFCCCCCFGGGGGGGGGGHHHHHHHHHHHHHHHHGFHHHHGGGGGHGFHHHHHHHHHHHHHHHHHHHGHGGEHGGGGCGGGHHGGCGGGGGHHGHHHGGGGGGGG.BFFFGAGADFGAFDGFGGCFFF;DDFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFF09FFFE00;BE@;DABBFFFFFBBFB00;F:9;FFBFFF9BFFFFFFFFFFFFF90/::BFFFBF0 +@M01368:8:000000000-A3GHV:1:1101:18422:19051/2 +CTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGG ++ +BCCCCFFCFFFFGGGGGGGGGGHHHGHHHHHHHHHHHHHHHHGGGGHHHHHHHHEHHHHHHHGGHHGGHGGHHHHHHHGHGGHHHGGGGGHGHHHHGGGHFHFHHHHHGGGGGHBFFCGDHHHGGGGGGHGGGGGGHHGCGGGFGHHBGGGGGFFFHEGGGGGCDCCE@EFGHHHHFHEGHGFFHHGB;ECBFGGGEFEFFGF0AFGFGFFG.;;DFFFFFFFFFF090BFFFE?FEFBBFBFFFB990BF +@M01368:8:000000000-A3GHV:1:1101:25545:21098/2 +GCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTTCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCAACCTATGTCGCAGTATCTGTCTTTGAT ++ +3AAA34BFFFFFGCGGCGGGCGHHHGHHHHCGHFGHFHGFGGCDGGBD5FGFFFHFHGGGHAEBFGHHHEHGGFHBGGFHHHGGGEEFHHHFHFFHDHGHGHHHHFFHHGGFAEGFFFFFBGHHHFFHHHHHHHFHFHHFHGFHGHGA/<@EFHEHH1GGHFFFHG1>=FGHFHGF.GE//-AADEBGGGGFFCGGHHA5DFGAE?AAEDFHG2AEE1FBE00A1BGE0?E1FFGEEEGB4F43?EE/>///F??//@BCBAC<<.<<AABBFFCFFFGGGFGGGGGGGGHHHHHHHHHHHHHHHHHHGHHHGGEGGFFHHHHHHGHGHGHHHHHEHFHGHHHHEFHHGHGGGGGHHHHGHHHHHHHHHGFGHHHHHGHGHGHHGGGCCGGGGFGGGGGGDGGGGFGDGGGFFFFFFEFFFFFFFBAF:9AF//;BEF-DD/FFFFBFFFFFFFFFEBFF?FB;/;//BFFEFFE/::..9;-:B9:FBFFFFFFBFB/9/FFF/B///99/:B/// +@M01368:8:000000000-A3GHV:1:1102:17584:10050/2 +GTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGGAGATCTTGGGGGGTGG ++ +ABCCCFFFFFCFGGGGGGGGGGGGGGHHHHHHHHHHHHHHHHGHHHFHGGHGGHHHHHHHHHHHHHHGHHHGGHHHHHHHGGHHHHGGGGGHHHHHHHHHHHHHHGGHHHHHHGHGHGGHGGFGGGGGGGGGGFGGFGGGGGGGGGFFFFFFFFFFEFFCFAAFEFEFFFFFFFCFDFFFFFFEFFFFFFFFFFBFFFFFFF/:BFFFFFFFFFFFFFFFFAFC=@-.;A///B././//////.-9---- +@M01368:8:000000000-A3GHV:1:1102:18842:24844/2 +CTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGG ++ +CCCCBFFBFFFFGGGGGGGGGGHHHGHHHHHHHHHHHHHHHHGGGGHHHHHHHHFHHHHHGHGHHHGGHGGHHHHGHHHHHHGHHGGGGGHFHHHFGHHHHHHHHHHHGGGGGHHHGFECGGHGGGGGGHHHGGGGHHGGGGHHHHHEFDCADGHHHHGGGGFC?AE-@DDFHFFHHHCFHGGFFGFFFGGGEFEGFGED9C?CEGF90FEF/A./BBF9FFFFFFFFBFFF.9AF////;FFBFFB/:BF +@M01368:8:000000000-A3GHV:1:1102:21788:11027/2 +AATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCCTG ++ +CDDDDFFFFDDFGGGGGGGGGGHHHHHGGGGHHHHHHHHGHHHHHHHGGHHGEHGGHHHHHHHHHHHHHHGGGGEHHHHHHHHHHFFFFFHHHGGGGGHFHGBFEHHHGGGGGGGHHGGGGHGGGGGHHFHHHGGG@CGHHHGGGGGGGGGGGEGDGFFDHHGEHHBCFHGBGFAGGHFGGCEEF0BE///FDFACF?FHFGC//??1B0<FF>DFCFDFFFFFFFFF-BBAFFFFFFFFE@D9B;FBFFFFFFFFB9/9FFFEBF9/B/BB/BFAADDFFC-;;9-;@@9-.9/://///://///...- +@M01368:8:000000000-A3GHV:1:1103:25465:20010/2 +AACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGTGCACCCTATGTCCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGTAGGTCTTGGTGGTCGTCCTCGTATTTGAACACATAATCGTG ++ +AAABBFFFFFDFGGGGGGGGGGHHHHHHGHHHGFHHHHHHHGGGGGHHHHHGHHHHHHHHEHHHHHGHGGHHGGGHHHHGGGGGHHHHHEHFGEHHHEE14BGHHHGHGHFCE/EFGGGGGGHGGGGGG?FFGGGG.ADGGGFFDA.FFEC--;9BBBD?FFFFFFFFBBFFFFFFFFFFFFFFFFFADFDFFC-BBB;BF-.;9:B9B9/9/;9;//;:.99/;-...:.../;//////.9/////.;. +@M01368:8:000000000-A3GHV:1:1103:25519:6301/2 +CTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATGCTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGGGATGTCTTATTTCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGTAGAT ++ +>AABBFFFFFFFFFFGGGCAFGCGHGGGHGGGHHFHHHFHHGGGGGFFFHHHBHHHHG5BG5GFEEEE1FGFFGF1EGHGGG/BFHEF@EA1BE1@EED3?4DHGGGGEGHFFFDDG?DCDEG@<?@DGCAGC-;@DGFFHGCHEEFGGFGFB/;..BBB.CCE?A0FE0EBFF0FEF.;=DFFB//BFBF//;/:9BA.9FB9//;9//;9F;FF +@M01368:8:000000000-A3GHV:1:1104:13832:4398/2 +GACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGGGGGTGCCGGCTCATTAAAAAAAGAAGATGTCGAGATGTCAGTGCGTGAGTGGAAGTG ++ +BBCCCFFFFFFFGGGGGGGGGGGGGHGGGGGGGHHHGFGGHHFGGGHGHHGHGFGECGHHHHGGGGGGGHEGFGGGHHGHFHHHHFGHFHHHHGHHHHFGGFGG3FFGGHHHHHHH2?FFGHFHHHHHHHHHBGHDGC0FGHHGHHGHHHHHHHHHHFGHHGGGGGHG-:@?FAGA:.:09CB0000;9B0.9-;---:./----.0;B00;:0;;.../90009..-..00000000-...90:.00/00 +@M01368:8:000000000-A3GHV:1:1104:23670:4161/2 +CGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAA ++ +A3AABBFFFFFFGGGGGGGGGGHHFHHHHHHGHGHHHHHGCHHHHGGGGHHHHHHHHHHHHHHHGHHGHGHGGHGGHGHGGHHGFGHHGHHHEHHGHHHHHGHEHEHHHGGFGGHHHFHHHGHHHHHHHGHHHHG4EGGGFCBBCA?DGGGGGGCCAFFFFFFFFFFFFFFFFFDFFF.9@--=AFFFF;FBFFBEFFF9FBFFB9/FFFFFFB/FFFFFFFBFFFF/BBFBB/FFFE?.;@FFB?FBB// +@M01368:8:000000000-A3GHV:1:1104:24927:22253/2 +GTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGGGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGGGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACGAG ++ +AABBBFFFFFFFGAFGG5GGGGHDGHHHHHHHHHHHHFHCFGC2F3BFGFGHBGHHFHEDHFGGGGGHHHG3F?GHHGGC/EFEHFGAEFFHGGAEHGGGG3EEGEEDHFHAAD@DGGGHG/AC1>GED-<<-.;C0:0C:C;GH:::::0B90;FFGEGFBFF0C.;9F0;0090;900;00FA9D.A...-9=BBFFFE/FE9;9E//BBBB.BBB///B/9/B..;. +@M01368:8:000000000-A3GHV:1:1104:4705:7727/2 +TCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCG ++ +ABBBAFFFFFFFGGFGGFGGGGGHHHHFHHHHHHGGGGHHHHHHHHHHHHHHHHHHHHHHEDEFGHFHHHHGHGHHHHHHHGHHHHHHHHFHGHHHHHGGGGGHHHGHHHHHHHHHHHGHHHHFHGHGHGGHGGGDCD.;FFGGGGGGGGFFFFFFFFFFFFF.>FFFFFF-BBEFFFFFBFFFFF-DDFFFFFFEFFFFFFFFFFFF?F///9FFFFF/;FFFFF?AADCFFEFAFFEFFFFFB/;B/A9 +@M01368:8:000000000-A3GHV:1:1104:7756:4776/2 +CTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGG ++ +BBBBBFFBFFFFGGGGGGGGGFFHHGHHGHHGGHGHHGFHHGGGGGHHHHHHHHGFF3GGGHGGGHFDHEGHHFHHHHGFHBCGHGEGFFFFFHHHHHGHHHHHGHHHGGGGGHHGHHGGHHHGGGGGGHHECCFDHHADGGHHHFGFDC////FHHHDGFCGCDEECF?EGGFFHHFEHFGGHHFFF/A900;C.9..C0990.;CFBFGE/9.9B99/;B.99/FB///9AEF/99/9/BBF/B/B//A +@M01368:8:000000000-A3GHV:1:1105:11310:21761/2 +ACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCTCC ++ +BCCCCFCCCCCCGGGGGGGGGGHHHHHHHHGGGGGHHHHHHHHGHHHHHHHHHGHHHHHHHHGHHHHHHHHHHGGGGGHHHHHHHHHHGHHHHHHHHHHHGGHGGHHHHGHHHDGHHHHHHHHHHHHHHHHHGHHHHHGGGGGGHHHHHGHHHHHHHHFFHHHHHCCAHFHHGGFFFFFFFFFAA;BDDFFFF00.-=D.ABDA-.::AAF;F@.AAD./0000.9-;.9:00009:F090;FF000900; +@M01368:8:000000000-A3GHV:1:1105:11578:14676/2 +GCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGT ++ +CCBCCCBBCCCCGGGGGGGGGGHGGGGGHHHHHHHGHHHHGHHHHHFFFGGHGHHHHHHHHHHHHHHHHFGHHHHHHHGHHGHFHFGHHHGGGGHHHHHHHHHHHHHHFHHHHHHHGGHGGHHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHDGGCGHHHHHHHHHHHHHFF>ABFDCFFFFFDGFGFDGDEG1EGEFFBFHFGHA0E00A0CAAC0FFDFFAGHFEHHFDH/EAFCFFHACGHHAFFEF2DFFG/FGHHHHGHGH0C@/E@GGC0FG10>GGFGHHGHFFBGFGB2>F@;-A@FF<-;---@-;-----;-BFF/;---9//BBFFFF//B;FF///9BF///;/;B/BBFBFFBB/;-9--///;9/:-;BFFBBB9F +@M01368:8:000000000-A3GHV:1:1105:15502:11453/2 +GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACCTGTCTCTTATACACATCTG ++ +CCCCCFFFCFFFGGGGGGGGGGHHHHHHGHHHHHGGGGGFHFHHHHHHGHHHHHGHHHHHHGHFHGHGGGGGGGGHHHGGGGGGGGHHHGHGGGGGGGGGGGHHGGGFGGGFEGGGGFFFGGFGDGGGBFGFFFFFFFFFFFFFE0BFFFFFFFFFFFFFFFFFFF;BFFDFFFFBF0BFFBFFEEBFFAA009FFFFFBFFEBB0;.BFFBFFFFFFFFFFFF0;BFFFFE//090::B0000:009900 +@M01368:8:000000000-A3GHV:1:1105:16787:9771/2 +TCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCG ++ +CCCDCFFFFFFFGGGGGGGGGGGHHHHHHHHHHHGGGGHHHHHHHHHHHHHHFHHHHHHHGGFGGHHEHHHHHAGHHGHBHHHGHHHHHHGHGHGHHHGGGGGHHHHHHHHHHHHHHHFFHHHHHGHGHGHHGGGACAFGGGGGG-AFGGFFFFFFFFFFFFFFFFFFFFF-9@AFEDFFFFFB.9-AABBBFFFEFFFFFFFFBFFFFFFEBB0BBBFFBFFBFBEDEAFFEB.ABFBFBFFFF0::/.- +@M01368:8:000000000-A3GHV:1:1105:17564:15723/2 +ATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTT ++ +CCCCCCCCCFFFGGGGGGGGGGHHHHGHHGHHHHHHGHHHHHHFHHHHGGGGHHGHFHHHHHHHFHHHHHHHGHGGHGGHGHHHHFHFHHGHHHHHHHHHHHHHEHGHHHHHGGGGGHHHHGEHHGHHHHHHGHFGHFHGGAHGGHGGGCCBDFGFFAFDBDDFFFFFBDA;AAD-ADC?AFEFB-@@AAEFEFFFFF..-:.B0:FFFFFFEF0BFF09FF.0;9BFFFFF0BFFFFFF?.::-909;.9 +@M01368:8:000000000-A3GHV:1:1105:18108:12091/2 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTACTGTCTCTTATACAC ++ +CBBCCFCCCCCFGGGGGGGGGGHHGHHHHFHHHHHHFHGHHHHFHFHHHHGGGGHHHHHHHHHHHHHHHHHHHHHHGGHGGHGHHHHGHFHGHHGHHHHHHGHHHHHHGHHHHHGGGGGHHHHHHHHGHHHHHGFGHHHHHGHDHGHHGCA;?AAFGFEFDFFF.FFFFEBDFFFFFFFFBFFFFFFFFFEFEFFFEFFFFFFF.:BFFF0BFFFFFFFFFFFF;0FF0;BFFFFFFFFBFFFBFEFFFF0 +@M01368:8:000000000-A3GHV:1:1105:18331:16683/2 +TTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGGGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATATCT ++ +CCCCCFFFFFFFGGGGGGGGGGHHHHHHHHHHHHHGHHHHHFHGHHGHHHHHGGFGHHIIHHIHHHHHHHFHCFGHEEHFFHGHHFGHGHGHFHHGGGGGGEEGHHEHFHHGGHFHHFGGGGDFGBCGG?DGHGGFCGGHHECEF?CHGGDGFFGGHHC?EEGGHHHH.>AEGGGHEE-AEG0C0GCCEFFFFGEFG/;CEGGGE.:CAE.;@.@EFBFFFF/..:BFF0:BB00FBFFFFBDAFFF0;0B +@M01368:8:000000000-A3GHV:1:1105:20577:18421/2 +ATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGCCGAAAATACCAAATGCATGGAGAGCTCCCGTGAATGGGTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTT ++ +?ABBBFFFFFFF4CFFGGGFGGHHHHHHHCEGHHCHFGHCAFEFHHHGGAEGGHHDHHHBGHBGHHH?5BGE1AEEFHHHHHGHFHBGHGEGGGGHBFHHFGHBGCFFG4EGHGG?E/FG3F3F/FGCDECEA/FGG/CEDBB//?BHEHHF0ADC@D>D9@DDCFFFFFFCFDFE.AA;/BB/.;-.;BFBBBFFBFFFFFFF/9FF?/;/BBFFFFF/;FFB/ +@M01368:8:000000000-A3GHV:1:1106:17389:18671/2 +GTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTCTTTAGGGTTTATGACCCTGA ++ +BBCCCFFFCBCCGGGGGGGGGGHGHGHHGFGFGFHGGHHGHHHHHHHHFHHGGGGAGFBGHHHFBBGH5GDHH5EEEGEHEGFBF1DGFDGFGGGBBCEEGEHFEFDDHFBFHFD/@BGDGDHHGDCCFCD/C<->.ADDGE1GA;9.FFFFFFD-99ADFFD-.9FADF--9..;AAFEFFEAA-;.///BB:BBF/BBB//B//99/;BBB/9////F +@M01368:8:000000000-A3GHV:1:1107:18714:24788/2 +GGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTAGGTCGCAGGAACTGTCCTTGT ++ +BCCCCCFFFFFFGGGGGGGEGGHHHHGGFFHHHHHHHHHHGGGGGHHHHGHHHHHHHHHHHHHFHHHHHHHHGHHHHHHGHHHHGGCGHHHHHHHHHHHHHHHGHHHHHGDGHGGHHHHHHHHGHHHHHHHHHHHGHHHHHHEHHHGHFGGGGHHHHEHFHHGHFHHF.>GGHHHGHGHGGHCFFCDFDFFFFFFAAF.FFFFFEFAFCFFA..@.AFFFC9--9;.;A///00--;A-.;.000000090 +@M01368:8:000000000-A3GHV:1:1107:21355:8333/2 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCGGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACG ++ +AABCCFCCCCCFGGGGGGGGGGHHHGHHHFHHHHHHGHGHHHHHHGHHHHGGGGHHHHHHHHHFFHHFHGHHHHHHGGHGGHHHGHHHHGGHFHHHHHHHFHHGHHGEFFHFHHGGGGGGHHHFHHHHGHHHHE?FGHHHHGFGHGHHGGGDGGGGGFFFCFFFFFFFFFFFFA;DBBDFFFAFFFF;DBAFEDFFFFFEA?-9...9;9;BFFFF//BFBFFFFBFBFFFFFB/BFFF9B/EDEF-BFF. +@M01368:8:000000000-A3GHV:1:1107:22864:15712/2 +GTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTCTGTCTCTTATACACATCTGACGCTGCCGCCGAGGGCTCTAGTTGTGGTCTG ++ +AABCCFFFFFFFGGGGGGGGGGGGGHHHHHHHHHHGHHHGGGGFHGHHHHGHFGFFFFGGHGGGGGCHFHFAFEDHHGGFGGGHHEGEGGHHEGGCHGHHHHGGGGGGHGF0EECGGGGFECCFFGFBGHHHHHHGGHHGHFCFEHFBEFGGADDDC=CC0CCFHG:CEHHHGHBGH:GFFGFF0CAAA/BF09BBBBBBB9CFFB0C00;000;:CFFB;:...D?>:@=:9-@...//;////..;/;/ +@M01368:8:000000000-A3GHV:1:1107:24817:14233/2 +CTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAAAGTGTGGGCTATTTAGGTTTTT ++ +CCCCBFFFFFFFGGGGGGGGGGFHHHHHHHHHHHHHHGHHHGEHGGHHGHHHHHGHHEHHGGGGEHGHHHGEHHGHHDGHHHHGGGGGFGEFGGEGHHGCEEEGHHEFGGDHGGGGAGFHHHHGGF/?//GHFFDGGGGGHAD??DC1DHHBGHHHHGHF<FGGGGGGGGGGGFHHEFGHHHGGGGGGHHHCGGEFHFGGGFHHHHHGGGEGGHHFHEEGEG/EEEE/?/EHFHH3GGH4EGHHCB01CGHGHGD//<@CGGBFFDGHHHH0FEHHGEGFHCD11>0=GGHGCCFGHGBGGD0GGHC0C.@?ACEEC/.EDEB?BFFCFB9CFF090BBCBFBF0CCB0B00A.9E.B/9BA=-..B9FBF//;BBBBFFB:./9/9/9;B//9BF/;B/ +@M01368:8:000000000-A3GHV:1:1108:11059:23181/2 +AACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACC ++ +CCCCCFFFFFCCGGGGGGGGGGHHHHHHHHHHHGHHHHHHHHGHHHHHHHHHHHGGGGHHHHHHHHHHHHHHHHHHHHHHGGHGGHHHHHHHHFHHHHHHHHHHHHHHHHGHGHEHHGFCGGGHHHHHHHHHHHHHHHFGHHHHHGHGHGHHGGGBFGFFFFEAFFFA.BFFFEFDDFFFFFFFFFFEEFF;D;EFFFFFF0BFFFADDFFFFFBFBFFFFFBB0FFFF0::0BF000BFFFBBF0EDFF@ +@M01368:8:000000000-A3GHV:1:1108:12233:13498/2 +GGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACTGTCTCTTATACAC ++ +ABBBCFFFFFFFGGGGGGGGGGHHHGHHHHHHBGHHHGGGGHHHHHHHHHHHHHHHHHHHHHHGGHGGHHGHHHHHGGHHGHHHHHHGHHHHHHHGGHHHHGGGGGHHHHHHHHHHHHHGHGHHHHHHGHGHGGHGGGGAGGFGGGGGGGGFFFFFFFFFFDFFFFDFFFFFFFFF>.AEDEFFFFF?E-DAB9FFFFFFFEF:FBFB009:0FFFF9BFFBFFFFBB0.9A=-/0BFF0BBFFBFF:90: +@M01368:8:000000000-A3GHV:1:1108:14328:14751/2 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATCTGTCTCTTATACA ++ +ABBBBFBBBBBFGGGGGGGGGGHHHHHHHHHHHHHHHHGHHHHHHDHHHHGGGGHHHHHHHHHHHHGHHHHHHHHHGGHGGHGGHHHHHGHHHHHHHHHHHHHHHHFHGHHHHHGGGGGHHHHHGHGHHHHHHG?FHHHFHGHGHGHHGGGGGGGGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFAFFE.ADEFFFFFFFADFFFFF///BBFF/FFFFFBFAFFFFBB///;9:BBFFFBF9B9//:/ +@M01368:8:000000000-A3GHV:1:1108:16350:22621/2 +CATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATAT ++ +CCCCCFFFFFFFGGGGGCGGGGHHHHHHHHGHHHHGHGHHHHGGGGHHHHHHHHHHHIHHHGHHHHHHGGHGGHHHHGHGHGHHHHHHHHHHHGHHHHHHGGHHHHGGGGGHHHHHHHHHHHHHGHGHHHGHHGGGGGHHGGGDADDGFFGGGGFFFFFFFFFDFFFDFFFFFFFFFFFAFAEFEDFFFFFFEE-AAFFFFFBFFFFF0FFFBFF;AFFFFFFFFFFFFFFFFFFFFFAB9BFFFFFBFFF +@M01368:8:000000000-A3GHV:1:1108:21791:23272/2 +CATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCT ++ +BBBBBFFFFFFFGGGGGGGGGGHHHHGGHHHHHHHGGGEGGHHFGDHGHHHHHHGGHHH2GGHHHGHHGGGGFGFBFHHHGGGGHGHGHHHHHHHHHHGHGHHGHHGGHGGHHHHGGFHGGGHFHGHHHHHGGHHHGHGFCGFFDGGG/FCFHBGHHHHHHHGHCHHHHGGGHGHGHGCFF;>@DFFBFFFFFFDFFFFFFFF;;.DDAA;FFF;BB;C@..9::BBFBBA;-:.::FBB///;/////:/ +@M01368:8:000000000-A3GHV:1:1108:23314:26089/2 +CATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCC ++ +BBAAAFFFFFFFGGGGGGGGGGHHGDHHHHHFHHHHGGGGGFHFFGFFHEGHHHHHHHHFGHFHHHHDEHHHHHHGHFHHGGGGHHGFHHHHHHHHHHHHHHHHHHF?EEFCHFGFHHEFFFHGFHHGHFHFHHHFFHAG??GHACGGDGHGGBFHHHHHHHGHFGHGHGEGHGCFFF@CD-BDDFFFFFFFFDDFFBBFFD-B-A.DFFA.AFEFF-;-..:..BFFFFFA-../9FFF//B/BBBBFF/ +@M01368:8:000000000-A3GHV:1:1108:25084:18127/2 +CCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAG ++ +CCCCCFFFFFFFGGGGGGGGGGGGHHHHHHHHFHHHHHHHHHHHHHHGGHHGHHHHHGGGGFHHHHHFHHHHHHHHHGGHHHHHCGGHGHHGGGGGGGHHGGGGGGGGHHHGHHGGGGGGGGGGGGGGGG@CDE?EFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFFE;DFBFFFFFFFFFFFFFFFFFFFAFFFFFFFBFFE0BBBFFFFFFFFFFBFFFF9BB0:0:B; +@M01368:8:000000000-A3GHV:1:1108:7626:10048/2 +AACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGGAGATCTCTGGGGTCGGCGGATTT ++ +BCCDDFFFFFFFGGGGGGGGGGHHHHHHHHHHHHHHHHHHHGGGGGHHHHHHHHFGHHHHHHHHHHHHGHHHHGGFHHHFGGGGHHHGHHHHHHHHHHHFGHHHHHGHGHGHHGCGCGGFHHHGGGGGGGGGGGGGGGGGGGFFFAFFFFFFAA-AFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBBFFDFFFFFB-<>;>@BE<:000.0.0/0<00<0:>..-<<<---.0 +@M01368:8:000000000-A3GHV:1:1109:11270:12255/2 +CTACCTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGGAGATT ++ +BCCCCFFFFFFFGGGGGGGGGGHHHHHFHHHHGGGGHHHHHHHHHHHHHHHHHHHHGHGGHGGHHGHHHHHGHHHHHHHHHFHHHHHGHHGHHHHHGGGGGHHHHHHHHGHHHHHH?FHHHHHGHGHGHHGGGGGGGGGGGGGGGGGGGGFFFFFFFFFFFFFFFFECADFFFF.DEBFFFFFF;D.B;.ED.FFFFFFF-.:00009BBF +@M01368:8:000000000-A3GHV:1:1109:24217:19077/2 +TAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGAACCTGAAGTAGGAACCAGATGTCGGATACAGTTCACTTTCTGTC ++ +BBBBBFFFFFFFGFEGGCFGFGHHHHFHHFHHHHHHGGEEEFGHGFFAGFFEEAEGGHHBEEFAFGAEF1G3GFGGEEF@EAGGHGFF?ECGGHEEEFGGHHHHFHHFHHGFHHHC2DAGHGG00-BFFA9FFFFFFFFFFFBFAFFAFFFFFBFFFFFFF/9BFFDFFDDFF-9@CDAD;:AFF/;B//....;;///; +@M01368:8:000000000-A3GHV:1:1109:27103:7153/2 +NTACTTCAGGGTCNTAAAACCTNAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCANGGNAGCTCTCCNNGNNNNTGGTATTNTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACCTGTCTC ++ +#>>AAAFFFFFFG#AAFFGGGG#BAFFAGHHHGGGGHGHHHHFHHHHHHHGHHHHHHHEGFGGHHBGHGHHBGFFEG2GHHFHFHHHGHHGHFH#/B#??FFGHHH##?####????FHG#??CFGGGEF>CGAGAGGGFGAA@@FFFFFFFFB=@@FF?FF;FFFF@@-@-AFEF?BB/BF?E-BBFFF;:BB/BFFFB/BFFFBFBFBBFFFFFB/FFFBB/;9A@@BFFBFFFF/BBFFFFFFE/:9/ +@M01368:8:000000000-A3GHV:1:1109:8763:7859/2 +GGTTCCTACTTCAGGGTCATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAG ++ +>AAABFFFFFFFGGGGGGGGGGHHHHHHHCHFAFHHDGEGFHHHHHHHHFHHHHFHHHHHHGHGGHGGHHHFHGHHHHHFFHHHHHHBGHFFHGHGHHHHHGGEGGHHHHHGHHHHHHHHHGFGHHHHGHGHGHHCCGGGGGGGGGGAAGF?FFFFBFA@=BDDFF?-;FFF;@CFFEFEFFFFFFFFFBFDFFFFFFFFFFFFFFFFFFFFFF:FFFF/BF/FFFEFF?DECAFEFFFFFE:FFFFFFFB +@M01368:8:000000000-A3GHV:1:1110:12808:18367/2 +GCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACAGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTCTT ++ +ABCCCCDDDFFFGGGGGGGGGGHHHHHHHHHGGGGGGHHHHFHHGHHHHHHHHHHHHHGHHHGGHHHH3GFHEFHHHHHGHHGFHHHHHFGHHHHHGHGHGGHGGGGGGGHHHGGCGGGGHHGGGGGGGGGGGGGGGGGGGF-BAEFEBEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFFFFFFFFAFEFDFFFFFFFFFF0BFBED=@DFFFFFFFFFEFFFFFBFAFB;0:;F0 +@M01368:8:000000000-A3GHV:1:1110:14751:26540/2 +ATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGGTTGGGCTATTTGGGTTTTTTGACCCTGAAG ++ +BBBBBFBBBBBFGGGGGGGGGGHHHHGGFHGGHGGCHHHHHHGHHHGHHGEGGGHHHHHHHHHHEFHHHHHHGEE?FGHGEHGGHHHGGGGGGEHHGGGGHHGGGEGHHHGHGGGDFGHHGHGGGGGCGAFFCCDFHHHH1CGGHBHFHFCGFFCDFHGCCE.CC:EGFHHCGHHGCF?FGGGFGGG;B9;FFFFFFEGEBF0;;;0;CFBFFF?D.DE.:..99-...;;//9A.;DE.9-./:FB.;// +@M01368:8:000000000-A3GHV:1:1110:15519:10008/2 +GGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAG ++ +BBCBCFFFFFFFGGGGGGGGGGHHHGHHHFHHBGHHHGGGGHHHHHHGHHHHHHHHHHHHHHHGGHGGHHGHHHGGBGHHHHHHHHHHHHHHHHHGGFHHHGGGEGHHHHHHHHHHHHHHHGHHHHHHGHGHGHHGGGGGGDGGGGGGGGGBFFFFFFFFFFFFFFFFFFFECB-=BFFFFFFFFBB?E-A::FFBFFFFFBFFFFFFFFFFFFBFFEFFF;0BFF0:FE?FFFFBFFFFFFFFFFFFFF0 +@M01368:8:000000000-A3GHV:1:1110:16559:7507/2 +GCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAAT ++ +>AABBCCCCFFFGGGGGGGGGGHHHHHHHHHGGHGGHGHFHHHHFHHHHHHHHHHFHHGHHHHGHHHHHGGEEGHHHHHHHHHHHHHHHHHHHHHHGHGHGHHGGGGGGGHHGGGFGAGFGGGGGGGGGGGDDDFAGGFFFFCFAFEEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFFFFFFFFFFFE..@BFFBD./;:0FFFFF09B/AFCA=-99BFFBFFFFFFFB:BFBFFBFB9: +@M01368:8:000000000-A3GHV:1:1110:16700:6843/2 +GTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTCTGTCTCTTATACACATTTGTCGCTC ++ +CBCCDFFFFFFFGGGGGGGGGGHHGHHHGGHFHHHGGGGGHHHHHHHHHHGHHHHHHHHHHHGGHGGHHHHHHHGBGFFHHHHHHGHHHHHHHHGHHHHHGGGGGHHHHHHHHHHHHHHHFHHHHGHCG/GGHHGGGGGGAFGEEADAGGDFFFF00D>CFDDFFFB?AF?B;-@>.FEDFFFFF0BF-BDFFFFE00B0BF0FFFF:BFA0B0:BBFBBFFFFF0B0:000<00FDFFFFFFFFFFFBFFFFFFFFFFF/FFF/BFFFFFFFBA.?@;FFBDFFFF/ +@M01368:8:000000000-A3GHV:1:1111:7437:24508/2 +ACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTCTGTCTCTTATACACATCTGACGCTGCCGAGA ++ +BBBBBFFBABBBGGGGGGGGFGHHHHHHHHHHHHHGHHHGFHHHHGGGGDHHHHHHHHGFHHHHHGHHHHHHGHFHGHHGGGDFGDFFGGGFFGFFGHGHHHGGC?AD-:A-AFFGGG-?9EFEFGGFFBFEFCFFFFFFFBBBFFFFFFFFFFFFFFFBBFFFFFFFFFFFFFFFFFFEFFFFFFBFFFFEFEFFFF;CF-FFFFFFFF/BFFFFFFFFFFFFEFBFFFB/FFFFFFF//9A..;F--@9 +@M01368:8:000000000-A3GHV:1:1111:8665:25542/2 +CTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGACAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTA ++ +CCCCCFFFFFFFGGGGGGGGGGHHHHHHHFHHHHCHHGDGGGGHGGHHHHHHHHGHGHHHGGGGGHFHHHHFHHHFFHHHHHFGCGGEHFHHFGGHFHGGGGGGHHFEEEAFEFFDFHGHHHHGGGGGGGHH0DGGGGCGHCC-FFFFFFFGGGGGGGGGGHHHHHGHHHHGHHGGHHGGHGGHHHGHHHHFHHGHHAFGGGHHHHHGHHHHFHFHHHHHECGGGHGHHHEFHFHGGGGGGGGFGFFGFFEFGEGHHHGHHGGDGGGHGFGGGGGGGGE@GCGGHHHGHHHHHGHHHHFHHGEGFFEGCE.ECCECEHCGGFFE/AABFFG0CFE0BFFF9CFGG.F/BBBFFGFGGGFFFFFFBFEDAEFFFFFBFFFFFFF0;FEBF: +@M01368:8:000000000-A3GHV:1:1112:10272:7828/2 +GTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGTGTGGGCTATTTAGGTTTTATGACCCTGAAGTAGGAACCAGATGTCGGATACCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCAAGTGTAGTGCTCGGGGGGG ++ +BBCCBFFCCCCCGGGGGGGGGGGGGGHHHHHHGGGGGGHHFHGGGGGFGHGGGGGGHHHHHHHHHHFHHHHFHGFFHHGF?ECEFFFFFFHHHHHH?FHHHHHHHHHGFHHHHHHGGGFGHHHHFHHHHHHHHHHGGGFGFGCGGGGHHHHHHHHHHHGGFFHGHHHHHFFHFCGFHCCGHGHCGGHG?DGGGGGFBBFGFBFBBBGGGB0000;CD?DFFAFF9=--;...://;////////.------ +@M01368:8:000000000-A3GHV:1:1112:9832:16531/2 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACG ++ +BBBBBFBBBBBFGGGGGGGGGGHFHHHHHHHHHHHHHHGHHHHGHFHHHGEEGGHHHGHHHHHHHHHHHHEHGHHGGGHGGHGHHHHHGHHHHGHHHHHHHHHHHHHHGHHHHHGEGCGHHHHHGHHHHHHHHHHHHHHGHFFGHGHHGGGDACDFFFFABFFFFFFFFFFD-BC?DA9.DFFBEA==B=?F.;ABBFBFEF-DAF/BFF9/FFFBB/9BFFFFFFF/BFBBFFFFFFF/9FEA?--B/;. +@M01368:8:000000000-A3GHV:1:1112:9832:6701/2 +GCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATT ++ +BBBBAFFFFFFFGGGGGGGGGGFGHHHHHGHHHHHHHHHHHHCGHHHHHHHGHHHHHHDGHHGGAECHHGHHHHHHHGHHHGHGFHHHHGGHGGHHFHHHHHHFHHHHHHHHH2GHHHHGHGHHFHHGGFGGGHHGHHHHHFHHHHEFHHHHEFGFFHGEHCFGDFDFFFFFCFFFFFFFFFFFFFAFBFFDFFFFFFFFADCF?9?EFBFB9EF-A.;BFFFFFFFBFFFFFFFFFFFFBFFFFFFFFFF +@M01368:8:000000000-A3GHV:1:1113:13532:8618/2 +GCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTGGAT ++ +CDDDBFFFFFFFGGGGGGGGGGHHHHHGGHHHGHHHGHHGGGGEHHGGHHHHHHHHHHHHHHGHHGHHHHHGGHHHHHFHHHHGCDCGHHHHHHHHEHHHGHGHHHGFHGGHGGHHHHHHHHFHHHFHHHGHHHHHHHHHFGGHHHHGGGEGAFHHGHHGGHHHHGHFBGFHGHGHGHGGHGGF@DAFEFBECAB=A.0BFFFEDF=A9.@D.-AAFFFA-:-..:.9/0:;0..-..0000::BFB09:: +@M01368:8:000000000-A3GHV:1:1113:21679:18011/2 +TTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCTGTCTCTTATACACATCTGACGCTGCCGACGACCGAGCTAGTTGAGATC ++ +BCCCBFFFFFFFGGGGGGGGGGHHHHHFHHHGHHHGFGHGGHGGHHHHHHGHHHHHHHGGGGFHHHHHHEGHHHHHHHHFGGFCGEHFHFGEGHHHFGGGGGHHHGGGEHGGGGGHHHHHHGDCCCGHHHHDGGGGCD/FCC->DFFF. +@M01368:8:000000000-A3GHV:1:1113:5741:16959/2 +GTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACG ++ +BBBBCFCCCCCFGGGGGGGGGGHHHHHHHGHHHHHHHHGHHHHGHDGHHHGGGGHHHHHHHHHHHHHHHGHHHHHGGGEGGHHHHFHHHHHHHHHHHHHHHHHHHHHHGHHGHHGGGGGHHHHHHHHHHHGHHHFHHHHHHGHGHGHGGGGCGGFGGFFFFFFFFFFFFFBFFFFFFFFFFFFFFAFFFFEAEFFFFFFFFFFF9BFFBFFFFFFFFFFFFFFBFFFFFFFFFFFFFFFFFFADAB-/BF. +@M01368:8:000000000-A3GHV:1:1114:10130:11959/2 +ATCAGAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATCCATCGTGATGTCTTATTTAAGGGGAACGCTGTCTCTTATACACATCTGACGCTGCCGACGAGCGATCTAGTGGAGAGTTCTGTGGGGCCGGGGATCCTTATAAAAAAAATAGA ++ +BCCDDFFFFFFFGGGGGGGGGGGHHHHHGGHHHGGGCGGHGHGGGGHHGGGGHHHHHHGGGEGGHHHFGGGGG?E1FE?/EEHHHHHGHHGHHHHGHFHGHGHHGDGGFG2FF2?GHHHHHGCCCFHGHGHHHHGHHFEHHFGHHGHH<1=DGHHHGHHGHGAGAEEDG.CGCGHC0CGBFHGFBBF0ABDDEFF@?--:BB@.;:BF;0.0:0.0:000:BF.-.------.9/;000::0;0:--.000 +@M01368:8:000000000-A3GHV:1:1114:14540:5315/2 +CACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCC ++ +CCCCCFFCCDCCGGGGGGGGGGHHHHHHHHHGGGGGHHHHHHHHHHHHHHHHHHHGHHHHHHHGHHHHFHFHHHGGGGGHHHHHHHHHHHHHGHHHHHHHHGGGGGGHHHFHHHGHHHHHHHHHHHHHHHHHHGHGHHHGGGFGHHHHHHHHFHHHHHF?1FHHGHGHGHGHHGGFFFFDBFBE;BCC.:BFFFFFFFFFFFFFF;AFFFFF-=-.AEDEFFFFF..9A;9FFFF0FFFFE00FFF0:BA. +@M01368:8:000000000-A3GHV:1:1114:15066:16302/2 +TTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGTGG ++ +CCCCCFFFFFFFGGGGGGGGGGHHHHHHHHHHHHHHHDGHHHHFHHGHHGHHHHHGGGGEHHHHFHHFF5FHHFEGHHHGDHGGHGHGFGGGEHFGHHGGGGGHGHHHGHHFHHB3FGHHFGGGG?GFFHCCEBGFFECCDFEGFCFGCHHGFDDHHHGHHCFGGGGGFBFDGFG?-:..AFG.-C0C009;:00;00:9/:CEFFF?AE::9;9?0:FEF0;0..--./00::B/090000000;A.... +@M01368:8:000000000-A3GHV:1:1114:16639:15258/2 +TTATTATTATGTCCTACAAGCATTAATTAATTAACACACTTTAGTAAGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAGGATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTCCAGCGTCTCGCAATGCTATCGCGTGCGCACCCCCCAGACGAAAATACCAAATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGGGATAGACCTGTGATCCATCGTGATG ++ +CDCCCFFFFFFFGGGGGGGGGGHHHHHHHHHHHHHHHHGHHHHHHHHGHHHHHHGGGGGHGIHHHIH5DEGHHHF?FGHGGHGGHEGGHFHHGHGEHHGGGGGFFFGHFBG2GHEBGHHGHGGEG/GFGABEDFGHEED?GGHHFFGGGCFEGD/GFHFFGEFGCGG?CC??D-EF@EEEFGCDDBBFGGGEBBFFF09090A.BFGA.9CCA0;EBAB00BBFF.@-./;BB;BFFF0:00099AAFFF0 +@M01368:8:000000000-A3GHV:1:1114:2404:13066/2 +ATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGACCATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACTGTCTCTTATACACATT ++ +CCCCCFFFFFFCGGGGGGGGGGHHHHHHHHHFFHHHHGGGGGHFFFHHFHHHHHHHHHHHHHHHGFEGGGHGEDFCDFHGHFG@@DGGHHHHHHGGGGCGGGGGEHGGCGBB?CF99EGFGGFGG?D9CFFFF/BBFFFFFEF9BFFAFFFFEFFFFFFFFFFFFFFFFFFFFF.FFBBFFFFFFFFFFFF-9;;;BFFFFFB9BFBFBFABFFEFFFFFFFFFF::BFFBFFFF.9//;FFFFF/BFFB/ +@M01368:8:000000000-A3GHV:1:1114:9184:6959/2 +AAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTCATAAAACCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCCTGTCTCTTA ++ +CCCCBFFFFFFFGGGGGGGGGGHHHHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHEHIHHGGGGHHHHHHHHHHHHHGHHHHHHHHGGGGGHHFHHHHHBGHHHHHHHHHHHHHHHHHGHHHHHGGGGGHHHHGHHHHHHHHHHHHHHHHHGHGHGHHGGGGCFFFFFFFFFFFFFFFFFFFFFFFFFF.CFFFFAF=D=EAEFFF0B:0AF-DAFBFFFFFFFFFBFFFFFFFFFFBFFFEFF9B900B0 diff -r 0f8646f22b8d -r fcadce4d9a06 test-data/bwa-mem-mt-genome.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/bwa-mem-mt-genome.fa Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,239 @@ +>gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome +GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG +GTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTC +CTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTA +ATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATC +ATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA +AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCAC +TTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATA +CAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCC +AAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTC +ACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA +GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGC +AATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAA +ACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGA +TTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACT +CACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACAC +ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATC +AACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATC +CCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATA +CCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTC +AAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT +GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGA +AGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCA +TTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTA +GCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGAGCTAAACCTA +GCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG +ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATA +ATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCC +AAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCA +AAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGAT +AGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC +CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAG +TAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAAC +ATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAG +TAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCC +AATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA +AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGC +ATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAA +AGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCT +TACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTA +TGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT +AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAG +TCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACA +GCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCG +ATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGG +AGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT +ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGA +ACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAAT +TCCTCTTCTTAACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCA +TTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCC +CCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC +ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCC +CTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAG +CCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGC +AGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGC +TCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG +CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC +CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATAC +ACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCC +CTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAAC +AGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA +GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATAT +GTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGCTTAAACCCCCTTATTTCTAGGACTATGA +GAATCGAACCCATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTC +AGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCT +GGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT +TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTC +GTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTAT +CCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATA +ATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGGTTACCCAAG +GCACCCCTCTGACATCCGGCCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA +AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGA +GGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA +TAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTAC +CGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTA +ACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT +TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCAC +CATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATA +TCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATCG +CCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAGAAATTTAGGTTAAATAC +AGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA +CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGACCAATGGGA +CTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCC +GCCGGGAAAAAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGA +AAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCA +GCCATTTTACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG +AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCGAGCC +GAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCAT +TTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAAT +AATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTC +CTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG +CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTC +CTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATAC +CAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAG +CTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGG +AGACCCCATTCTATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA +GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTA +TGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGG +AATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAA +GTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGAAATGATCTGCTGCAGTGCTCTGAGCCCTAG +GATTCATCTTTCTTTTCACCGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT +ACTACACGACACGTACTACGTTGTAGCCCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATC +ATAGGAGGCTTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCC +ATTTCACTATCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAAT +GCCCCGACGTTACTCGGACTACCCCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTC +ATTTCTCTAACAGCAGTAATATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC +TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGA +AGAACCCGTATACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAA +CCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAAT +TATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCT +ATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC +TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGA +AACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTAC +ATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACG +AGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGA +CCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA +TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTC +TAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGC +AAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTT +ACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTT +AAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT +AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTA +CCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCT +GTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCT +CTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAAC +TAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT +AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTA +TCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTA +AAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGA +AACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCAC +CTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA +TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCAC +ACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTAA +AACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATT +TCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGG +CGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG +GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAG +CCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAA +GTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAA +TAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCT +CCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT +GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCC +AACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGT +AGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAGTATAAATAGT +ACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTAA +TAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT +AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAA +TTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAG +CCCTACAAACAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAG +TCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAGTTTAAACAAAACGAAT +GATTTCGACTCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG +CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGA +AGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAAT +ATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAA +TCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAAAACTAATCGTCCCA +ACAATTATATTACTACCACTGACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA +GCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCC +AACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATC +ATGGCAAGCCAACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCT +CCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAAC +CACACTTATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA +TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCC +TAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTT +AATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCT +AAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCT +ATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT +ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCA +TACTCTTCAATCAGCCACATAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCG +GCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTA +CGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTT +TGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG +TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGC +CCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAA +CCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACC +CCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAA +CAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC +AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCA +ACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCC +ATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCAT +CCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAA +CTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA +TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACC +CAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAA +CAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGA +TACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTT +TCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA +CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGT +CTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTA +TAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAAC +TCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATC +AAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAAC +CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTC +CATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTC +ACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCA +AAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGC +TACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCC +ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCC +TATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTA +CCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACC +AACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCA +TCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT +AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCA +ACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAA +TCACATAACCTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGTAA +CTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATCAACCCTGA +CCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC +TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAACACTCACCAAGACCTCAACCC +CTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCC +CCCTAAATAAATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAATAACACACCCG +ACCACACCGCTAACAATCAATACTAAACCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACC +CCATTACTAAACCCACACTCAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC +CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGACCCCAATACGCAAAACTAAC +CCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAA +ACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTA +CTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATC +ATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC +TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGC +AACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAAC +TTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTACTCAGTAGACA +GTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTTGCCCTTCATTATTGCAGCCCTAGCAACACT +CCACCTCCTATTCTTGCACGAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC +ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACAT +TAACACTATTCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCC +TCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTA +GGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAAC +AACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT +AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATAC +TTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTC +CTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGGACAAATCAGA +GAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTC +ATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA +TTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCA +ATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCA +ACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAG +TACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCC +TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG +CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTC +ATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG + diff -r 0f8646f22b8d -r fcadce4d9a06 tool-data/all_fasta.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/all_fasta.loc.sample Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,19 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +# +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +# +#Arabidopsis_thaliana Arabidopsis_thaliana_TAIR10 Arabidopsis_thaliana: TAIR 10 /export/home1/users/biocomp/chbernar/galaxy_testing/tool-data/from_personal_data/Fasta/TAIR10.fa diff -r 0f8646f22b8d -r fcadce4d9a06 tool-data/bismark_indexes.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/bismark_indexes.loc.sample Sat May 06 13:18:09 2017 -0400 @@ -0,0 +1,2 @@ +# +#Arabidopsis_thaliana TAIR10 30 Arabidopsis_t_TAIR10_30 Arabidopsis_t_TAIR10_30 Arabidopsis thaliana: TAIR10 /export/home1/users/biocomp/chbernar/galaxy_testing/tool-data/from_personal_data/bismark_indexes/Arabidopsis_thaliana_TAIR10/Bisulfite_Genome \ No newline at end of file diff -r 0f8646f22b8d -r fcadce4d9a06 tool-data/bowtie2_indices.loc.sample --- a/tool-data/bowtie2_indices.loc.sample Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use a directory of Bowtie2 indexed sequences data files. You will -#need to create these data files and then create a bowtie_indices.loc -#file similar to this one (store it in this directory) that points to -#the directories in which those files are stored. The bowtie2_indices.loc -#file has this format (longer white space characters are TAB characters): -# -# -# -#So, for example, if you had hg18 indexed stored in -#/depot/data2/galaxy/bowtie2/hg18/, -#then the bowtie2_indices.loc entry would look like this: -# -#hg18 hg18 hg18 /depot/data2/galaxy/bowtie2/hg18/hg18 -# -#and your /depot/data2/galaxy/bowtie2/hg18/ directory -#would contain hg18.*.ebwt files: -# -#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt -#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt -#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt -#...etc... -# -#Your bowtie2_indices.loc file should include an entry per line for each -#index set you have stored. The "file" in the path does not actually -#exist, but it is the prefix for the actual index files. For example: -# -#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie2/hg18/hg18canon -#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie2/hg18/hg18full -#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie2/hg19/hg19 -#...etc... -# -#Note that for backwards compatibility with workflows, the unique ID of -#an entry must be the path that was in the original loc file, because that -#is the value stored in the workflow for that parameter. That is why the -#hg19 entry above looks odd. New genomes can be better-looking. -# diff -r 0f8646f22b8d -r fcadce4d9a06 tool-data/bowtie_indices.loc.sample --- a/tool-data/bowtie_indices.loc.sample Wed Feb 11 16:21:51 2015 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -#This is a sample file distributed with Galaxy that enables tools -#to use a directory of Bowtie indexed sequences data files. You will -#need to create these data files and then create a bowtie_indices.loc -#file similar to this one (store it in this directory) that points to -#the directories in which those files are stored. The bowtie_indices.loc -#file has this format (longer white space characters are TAB characters): -# -# -# -#So, for example, if you had hg18 indexed stored in -#/depot/data2/galaxy/bowtie/hg18/, -#then the bowtie_indices.loc entry would look like this: -# -#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 -# -#and your /depot/data2/galaxy/bowtie/hg18/ directory -#would contain hg18.*.ebwt files: -# -#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt -#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt -#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt -#...etc... -# -#Your bowtie_indices.loc file should include an entry per line for each -#index set you have stored. The "file" in the path does not actually -#exist, but it is the prefix for the actual index files. For example: -# -#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie/hg18/hg18canon -#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie/hg18/hg18full -#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie/hg19/hg19 -#...etc... -# -#Note that for backwards compatibility with workflows, the unique ID of -#an entry must be the path that was in the original loc file, because that -#is the value stored in the workflow for that parameter. That is why the -#hg19 entry above looks odd. New genomes can be better-looking. -# diff -r 0f8646f22b8d -r fcadce4d9a06 tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Wed Feb 11 16:21:51 2015 -0500 +++ b/tool_data_table_conf.xml.sample Sat May 06 13:18:09 2017 -0400 @@ -1,13 +1,13 @@ - - - value, dbkey, name, path - + +
+ species, version, release, value, dbkey, name, path +
- - + +
value, dbkey, name, path - +
diff -r 0f8646f22b8d -r fcadce4d9a06 tool_dependencies.xml --- a/tool_dependencies.xml Wed Feb 11 16:21:51 2015 -0500 +++ b/tool_dependencies.xml Sat May 06 13:18:09 2017 -0400 @@ -1,12 +1,17 @@ + + + - + - - $REPOSITORY_INSTALL_DIR - + + +