Mercurial > repos > bgruening > bismark
view bismark @ 4:243e8f9fb75b draft
Uploaded
author | bgruening |
---|---|
date | Mon, 09 Feb 2015 18:24:41 -0500 |
parents | 91f07ff056ca |
children |
line wrap: on
line source
#!/usr/bin/perl -- use strict; use warnings; use IO::Handle; use Cwd; $|++; use Getopt::Long; ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. my $parent_dir = getcwd; my $bismark_version = 'v0.10.0'; my $command_line = join (" ",@ARGV); ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail foreach my $arg (@ARGV){ if ($arg eq '--solexa1.3-quals'){ $arg = '--phred64-quals'; } } my @filenames; # will be populated by processing the command line my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line(); my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment my %chromosomes; # stores the chromosome sequences of the mouse genome my %counting; # counting various events my $seqID_contains_tabs; foreach my $filename (@filenames){ chdir $parent_dir or die "Unable to move to initial working directory $!\n"; ### resetting the counting hash and fhs reset_counters_and_fhs($filename); $seqID_contains_tabs = 0; ### PAIRED-END ALIGNMENTS if ($filename =~ ','){ my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file $fhs[0]->{name} = 'CTread1GAread2CTgenome'; $fhs[1]->{name} = 'GAread1CTread2GAgenome'; $fhs[2]->{name} = 'GAread1CTread2CTgenome'; $fhs[3]->{name} = 'CTread1GAread2GAgenome'; warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; my ($filename_1,$filename_2) = (split (/,/,$filename)); warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; ### additional variables only for paired-end alignments my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file ### FastA format if ($sequence_file_format eq 'FASTA'){ warn "Input files are in FastA format\n"; if ($directional){ ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = undef; $fhs[1]->{inputfile_2} = undef; $fhs[2]->{inputfile_1} = undef; $fhs[2]->{inputfile_2} = undef; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } else{ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = $G_to_A_infile_1; $fhs[1]->{inputfile_2} = $C_to_T_infile_2; $fhs[2]->{inputfile_1} = $G_to_A_infile_1; $fhs[2]->{inputfile_2} = $C_to_T_infile_2; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } if ($bowtie2){ paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } else{ paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } } ### FastQ format else{ warn "Input files are in FastQ format\n"; if ($directional){ if ($bowtie2){ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = undef; $fhs[1]->{inputfile_2} = undef; $fhs[2]->{inputfile_1} = undef; $fhs[2]->{inputfile_2} = undef; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } else{ # Bowtie 1 alignments if ($gzip){ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format $fhs[0]->{inputfile_2} = undef; # no longer needed $fhs[1]->{inputfile_1} = undef; $fhs[1]->{inputfile_2} = undef; $fhs[2]->{inputfile_1} = undef; $fhs[2]->{inputfile_2} = undef; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format $fhs[3]->{inputfile_2} = undef; # no longer needed } else{ ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = undef; $fhs[1]->{inputfile_2} = undef; $fhs[2]->{inputfile_1} = undef; $fhs[2]->{inputfile_2} = undef; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } } } elsif($pbat){ # PBAT-Seq ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = undef; $fhs[0]->{inputfile_2} = undef; $fhs[1]->{inputfile_1} = $G_to_A_infile_1; $fhs[1]->{inputfile_2} = $C_to_T_infile_2; $fhs[2]->{inputfile_1} = $G_to_A_infile_1; $fhs[2]->{inputfile_2} = $C_to_T_infile_2; $fhs[3]->{inputfile_1} = undef; $fhs[3]->{inputfile_2} = undef; } else{ if ($bowtie2){ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = $G_to_A_infile_1; $fhs[1]->{inputfile_2} = $C_to_T_infile_2; $fhs[2]->{inputfile_1} = $G_to_A_infile_1; $fhs[2]->{inputfile_2} = $C_to_T_infile_2; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } else{ # Bowtie 1 alignments if ($gzip){ ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files $fhs[1]->{inputfile_1} = $G_to_A_infile_1; $fhs[1]->{inputfile_2} = undef; $fhs[2]->{inputfile_1} = $G_to_A_infile_1; $fhs[2]->{inputfile_2} = undef; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files } else{ #uncompressed temp files ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); $fhs[0]->{inputfile_1} = $C_to_T_infile_1; $fhs[0]->{inputfile_2} = $G_to_A_infile_2; $fhs[1]->{inputfile_1} = $G_to_A_infile_1; $fhs[1]->{inputfile_2} = $C_to_T_infile_2; $fhs[2]->{inputfile_1} = $G_to_A_infile_1; $fhs[2]->{inputfile_2} = $C_to_T_infile_2; $fhs[3]->{inputfile_1} = $C_to_T_infile_1; $fhs[3]->{inputfile_2} = $G_to_A_infile_2; } } } if ($bowtie2){ paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } else{ paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } } start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } ### Else we are performing SINGLE-END ALIGNMENTS else{ warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; ### Initialising bisulfite conversion filenames my ($C_to_T_infile,$G_to_A_infile); ### FastA format if ($sequence_file_format eq 'FASTA'){ warn "Inut file is in FastA format\n"; if ($directional){ ($C_to_T_infile) = biTransformFastAFiles ($filename); $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; } else{ ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; } ### Creating 4 different bowtie filehandles and storing the first entry if ($bowtie2){ single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); } else{ single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); } } ## FastQ format else{ warn "Input file is in FastQ format\n"; if ($directional){ ($C_to_T_infile) = biTransformFastQFiles ($filename); $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; } elsif($pbat){ ($G_to_A_infile) = biTransformFastQFiles ($filename); $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files } else{ ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; } ### Creating up to 4 different bowtie filehandles and storing the first entry if ($bowtie2){ single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); } elsif ($pbat){ single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); } else{ single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); } } start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile); } } sub start_methylation_call_procedure_single_ends { my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; my ($dir,$filename); if ($sequence_file =~ /\//){ ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/; } else{ $filename = $sequence_file; } ### printing all alignments to a results file my $outfile = $filename; if ($prefix){ $outfile = "$prefix.$outfile"; } if ($bowtie2){ # SAM format is the default for Bowtie 2 $outfile =~ s/$/_bismark_bt2.sam/; } elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) $outfile =~ s/$/_bismark.txt/; } else{ # SAM is the default output $outfile =~ s/$/_bismark.sam/; } $bam = 0 unless (defined $bam); if ($bam == 1){ ### Samtools is installed, writing out BAM directly $outfile =~ s/sam/bam/; open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead $outfile .= '.gz'; open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } else{ # uncompressed ouput, default open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n"; sleep(1); if ($vanilla){ print OUT "Bismark version: $bismark_version\n"; } ### printing alignment and methylation call summary to a report file my $reportfile = $filename; if ($prefix){ $reportfile = "$prefix.$reportfile"; } if ($bowtie2){ $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; } else{ $reportfile =~ s/$/_bismark_SE_report.txt/; } open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; if ($unmapped){ my $unmapped_file = $filename; if ($prefix){ $unmapped_file = "$prefix.$unmapped_file"; } $unmapped_file =~ s/$/_unmapped_reads.txt/; open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; } if ($ambiguous){ my $ambiguous_file = $filename; if ($prefix){ $ambiguous_file = "$prefix.$ambiguous_file"; } $ambiguous_file =~ s/$/_ambiguous_reads.txt/; open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; } if ($directional){ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n"; } print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time unless (%chromosomes){ my $cwd = getcwd; # storing the path of the current working directory print "Current working directory is: $cwd\n\n"; read_genome_into_memory($cwd); } unless ($vanilla or $sam_no_hd){ generate_SAM_header(); } ### Input file is in FastA format if ($sequence_file_format eq 'FASTA'){ process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); } ### Input file is in FastQ format else{ process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile); } } sub start_methylation_call_procedure_paired_ends { my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; my ($dir_1,$filename_1); if ($sequence_file_1 =~ /\//){ ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/; } else{ $filename_1 = $sequence_file_1; } my ($dir_2,$filename_2); if ($sequence_file_2 =~ /\//){ ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/; } else{ $filename_2 = $sequence_file_2; } ### printing all alignments to a results file my $outfile = $filename_1; if ($prefix){ $outfile = "$prefix.$outfile"; } if ($bowtie2){ # SAM format is the default Bowtie 2 output $outfile =~ s/$/_bismark_bt2_pe.sam/; } elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) $outfile =~ s/$/_bismark_pe.txt/; } else{ # SAM format is the default Bowtie 1 output $outfile =~ s/$/_bismark_pe.sam/; } $bam = 0 unless (defined $bam); if ($bam == 1){ ### Samtools is installed, writing out BAM directly $outfile =~ s/sam/bam/; open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead $outfile .= '.gz'; open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } else{ # uncompressed ouput, default open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n"; } warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n"; sleep(1); if ($vanilla){ print OUT "Bismark version: $bismark_version\n"; } ### printing alignment and methylation call summary to a report file my $reportfile = $filename_1; if ($prefix){ $reportfile = "$prefix.$reportfile"; } if ($bowtie2){ $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; } else{ $reportfile =~ s/$/_bismark_PE_report.txt/; } open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n"; print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; ### Unmapped read output if ($unmapped){ my $unmapped_1 = $filename_1; my $unmapped_2 = $filename_2; if ($prefix){ $unmapped_1 = "$prefix.$unmapped_1"; $unmapped_2 = "$prefix.$unmapped_2"; } $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; } if ($ambiguous){ my $amb_1 = $filename_1; my $amb_2 = $filename_2; if ($prefix){ $amb_1 = "$prefix.$amb_1"; $amb_2 = "$prefix.$amb_2"; } $amb_1 =~ s/$/_ambiguous_reads_1.txt/; $amb_2 =~ s/$/_ambiguous_reads_2.txt/; open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; } if ($directional){ print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n"; } ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time unless (%chromosomes){ my $cwd = getcwd; # storing the path of the current working directory print "Current working directory is: $cwd\n\n"; read_genome_into_memory($cwd); } unless ($vanilla or $sam_no_hd){ generate_SAM_header(); } ### Input files are in FastA format if ($sequence_file_format eq 'FASTA'){ process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } ### Input files are in FastQ format else{ process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } } sub print_final_analysis_report_single_end{ my ($C_to_T_infile,$G_to_A_infile) = @_; ### All sequences from the original sequence file have been analysed now ### deleting temporary C->T or G->A infiles if ($directional){ my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; if ($deletion_successful == 1){ warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; } else{ warn "Could not delete temporary file $C_to_T_infile properly $!\n"; } } elsif ($pbat){ my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; if ($deletion_successful == 1){ warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; } else{ warn "Could not delete temporary file $G_to_A_infile properly $!\n"; } } else{ my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; if ($deletion_successful == 2){ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; } else{ warn "Could not delete temporary files properly $!\n"; } } ### printing a final report for the alignment procedure print REPORT "Final Alignment report\n",'='x22,"\n"; warn "Final Alignment report\n",'='x22,"\n"; # foreach my $index (0..$#fhs){ # print "$fhs[$index]->{name}\n"; # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; # } ### printing a final report for the methylation call procedure warn "Sequences analysed in total:\t$counting{sequences_count}\n"; print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; my $percent_alignable_sequences; if ($counting{sequences_count} == 0){ $percent_alignable_sequences = 0; } else{ $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); } warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), ### only calculating the percentage if there were any overruled alignments if ($counting{low_complexity_alignments_overruled_count}){ my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; } print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; if ($directional){ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; } ### detailed information about Cs analysed warn "Final Cytosine Methylation Report\n",'='x33,"\n"; my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; warn "Total number of C's analysed:\t$total_number_of_C\n\n"; warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; if ($bowtie2){ warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; } warn "\n"; warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; if ($bowtie2){ warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; } warn "\n"; print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; if ($bowtie2){ print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; } print REPORT "\n"; print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; if ($bowtie2){ print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; } print REPORT "\n"; my $percent_meCHG; if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); } my $percent_meCHH; if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); } my $percent_meCpG; if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); } my $percent_meC_unknown; if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); } ### printing methylated CpG percentage if applicable if ($percent_meCpG){ warn "C methylated in CpG context:\t${percent_meCpG}%\n"; print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; } ### printing methylated C percentage (CHG context) if applicable if ($percent_meCHG){ warn "C methylated in CHG context:\t${percent_meCHG}%\n"; print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; } ### printing methylated C percentage (CHH context) if applicable if ($percent_meCHH){ warn "C methylated in CHH context:\t${percent_meCHH}%\n"; print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; } ### printing methylated C percentage (Unknown C context) if applicable if ($bowtie2){ if ($percent_meC_unknown){ warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; } else{ warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; } } print REPORT "\n\n"; warn "\n\n"; if ($seqID_contains_tabs){ warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; } ########################################################################################################################################### ### create pie-chart with mapping stats ########################################################################################################################################### my $filename; if ($pbat){ $filename = $G_to_A_infile; } else{ $filename = $C_to_T_infile; } my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified $pie_chart =~ s/gz$//; $pie_chart =~ s/_C_to_T\.fastq$//; $pie_chart =~ s/_G_to_A\.fastq$//; # if ($prefix){ # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation # } $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; #Check whether the module GD::Graph is installed my $gd_graph_installed = 0; eval{ require GD::Graph::pie; GD::Graph::pie->import(); }; unless($@) { $gd_graph_installed = 1; } else{ warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; sleep(2); } if ($gd_graph_installed){ warn "Generating pie chart\n\n"; sleep(1); my $graph = GD::Graph::pie->new(600,600); my $percent_unaligned; my $percent_multiple; my $percent_unextractable; if ($counting{sequences_count}){ $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); } else{ $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; } my @aln_stats = ( ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], ); $graph->set( start_angle => 180, '3d' => 0, label => 'Alignment stats (single-end)', suppress_angle => 2, # Only label slices of sufficient size transparent => 0, dclrs => [ qw(red lorange dgreen cyan) ], ) or die $graph->error; my $gd = $graph->plot(\@aln_stats) or die $graph->error; open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; binmode PIE; print PIE $gd->png; } warn "====================\nBismark run complete\n====================\n\n"; } sub print_final_analysis_report_paired_ends{ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles if ($directional){ if ($G_to_A_infile_2){ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2"; if ($deletion_successful == 2){ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n"; } else{ warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n"; } } else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1"; if ($deletion_successful == 1){ warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n"; } else{ warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n"; } } } else{ if ($G_to_A_infile_2 and $C_to_T_infile_2){ my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2"; if ($deletion_successful == 4){ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n"; } else{ warn "Could not delete temporary files properly: $!\n"; } } else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1"; if ($deletion_successful == 2){ warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n"; } else{ warn "Could not delete temporary files properly: $!\n"; } } } ### printing a final report for the alignment procedure warn "Final Alignment report\n",'='x22,"\n"; print REPORT "Final Alignment report\n",'='x22,"\n"; # foreach my $index (0..$#fhs){ # print "$fhs[$index]->{name}\n"; # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; # } ### printing a final report for the methylation call procedure warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n"; my $percent_alignable_sequence_pairs; if ($counting{sequences_count} == 0){ $percent_alignable_sequence_pairs = 0; } else{ $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); } print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n"; print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n"; print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n"; print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n"; ### detailed information about Cs analysed if ($directional){ print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; } warn "Final Cytosine Methylation Report\n",'='x33,"\n"; print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; warn "Total number of C's analysed:\t$total_number_of_C\n\n"; warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; if ($bowtie2){ warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; } warn "\n"; warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; if ($bowtie2){ warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; } warn "\n"; print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; if ($bowtie2){ print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; } print REPORT "\n"; print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; if ($bowtie2){ print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; } print REPORT "\n"; my $percent_meCHG; if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); } my $percent_meCHH; if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); } my $percent_meCpG; if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); } my $percent_meC_unknown; if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); } ### printing methylated CpG percentage if applicable if ($percent_meCpG){ warn "C methylated in CpG context:\t${percent_meCpG}%\n"; print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; } ### printing methylated C percentage in CHG context if applicable if ($percent_meCHG){ warn "C methylated in CHG context:\t${percent_meCHG}%\n"; print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; } ### printing methylated C percentage in CHH context if applicable if ($percent_meCHH){ warn "C methylated in CHH context:\t${percent_meCHH}%\n"; print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; } else{ warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; } ### printing methylated C percentage (Unknown C context) if applicable if ($bowtie2){ if ($percent_meC_unknown){ warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; } else{ warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; } } print REPORT "\n\n"; warn "\n\n"; ############################################################################################################################################ ### create pie-chart with mapping stats ########################################################################################################################################### my $filename; if ($pbat){ $filename = $G_to_A_infile_1; } else{ $filename = $C_to_T_infile_1; } my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified $pie_chart =~ s/gz$//; $pie_chart =~ s/_C_to_T.fastq$//; $pie_chart =~ s/_G_to_A.fastq$//; ### special format for gzipped PE Bowtie1 files $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//; $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//; if ($prefix){ # prefix is now being prepended to the temp files already # $pie_chart = "$prefix.$pie_chart"; } $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png"; #Check whether the module GD::Graph is installed my $gd_graph_installed = 0; eval{ require GD::Graph::pie; GD::Graph::pie->import(); }; unless($@) { $gd_graph_installed = 1; } else{ warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; sleep(2); } if ($gd_graph_installed){ warn "Generating pie chart\n\n"; sleep(1); my $graph = GD::Graph::pie->new(600,600); my $percent_unaligned; my $percent_multiple; my $percent_unextractable; if ($counting{sequences_count}){ $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); } else{ $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; } my @aln_stats = ( ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], ); # push @{$mbias_read1[0]},$pos; $graph->set( start_angle => 180, '3d' => 0, label => 'Alignment stats (paired-end)', suppress_angle => 2, # Only label slices of sufficient size transparent => 0, dclrs => [ qw(red lorange dgreen cyan) ], ) or die $graph->error; my $gd = $graph->plot(\@aln_stats) or die $graph->error; open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; binmode PIE; print PIE $gd->png; } warn "====================\nBismark run complete\n====================\n\n"; } sub process_single_end_fastA_file_for_methylation_call{ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either ### the C->T or G->A version ### gzipped version of the infile if ($sequence_file =~ /\.gz$/){ open (IN,"zcat $sequence_file |") or die $!; } else{ open (IN,$sequence_file) or die $!; } my $count = 0; warn "\nReading in the sequence file $sequence_file\n"; while (1) { # last if ($counting{sequences_count} > 100); my $identifier = <IN>; my $sequence = <IN>; last unless ($identifier and $sequence); $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $counting{sequences_count}++; if ($counting{sequences_count}%1000000==0) { warn "Processed $counting{sequences_count} sequences so far\n"; } chomp $sequence; chomp $identifier; $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers my $return; if ($bowtie2){ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier); } else{ $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1 } unless ($return){ $return = 0; } # print the sequence to ambiguous.out if --ambiguous was specified if ($ambiguous and $return == 2){ print AMBIG ">$identifier\n"; print AMBIG "$sequence\n"; } # print the sequence to <unmapped.out> file if --un was specified elsif ($unmapped and $return == 1){ print UNMAPPED ">$identifier\n"; print UNMAPPED "$sequence\n"; } } print "Processed $counting{sequences_count} sequences in total\n\n"; print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); } sub process_single_end_fastQ_file_for_methylation_call{ my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call. ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either ### the C->T or G->A version ### gzipped version of the infile if ($sequence_file =~ /\.gz$/){ open (IN,"zcat $sequence_file |") or die $!; } else{ open (IN,$sequence_file) or die $!; } my $count = 0; warn "\nReading in the sequence file $sequence_file\n"; while (1) { my $identifier = <IN>; my $sequence = <IN>; my $identifier_2 = <IN>; my $quality_value = <IN>; last unless ($identifier and $sequence and $identifier_2 and $quality_value); $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $counting{sequences_count}++; if ($counting{sequences_count}%1000000==0) { warn "Processed $counting{sequences_count} sequences so far\n"; } chomp $sequence; chomp $identifier; chomp $quality_value; $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers my $return; if ($bowtie2){ $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value); } else{ $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1 } unless ($return){ $return = 0; } # print the sequence to ambiguous.out if --ambiguous was specified if ($ambiguous and $return == 2){ print AMBIG "\@$identifier\n"; print AMBIG "$sequence\n"; print AMBIG $identifier_2; print AMBIG "$quality_value\n"; } # print the sequence to <unmapped.out> file if --un was specified elsif ($unmapped and $return == 1){ print UNMAPPED "\@$identifier\n"; print UNMAPPED "$sequence\n"; print UNMAPPED $identifier_2; print UNMAPPED "$quality_value\n"; } } print "Processed $counting{sequences_count} sequences in total\n\n"; print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile); } sub process_fastA_files_for_paired_end_methylation_calls{ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping. ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the ### converted genomes (either the C->T or G->A version) ### gzipped version of the infiles if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; } else{ open (IN1,$sequence_file_1) or die $!; open (IN2,$sequence_file_2) or die $!; } warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one my $count = 0; while (1) { # reading from the first input file my $identifier_1 = <IN1>; my $sequence_1 = <IN1>; # reading from the second input file my $identifier_2 = <IN2>; my $sequence_2 = <IN2>; last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2); $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces $identifier_2 = fix_IDs($identifier_2); ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $counting{sequences_count}++; if ($counting{sequences_count}%1000000==0) { warn "Processed $counting{sequences_count} sequence pairs so far\n"; } my $orig_identifier_1 = $identifier_1; my $orig_identifier_2 = $identifier_2; chomp $sequence_1; chomp $identifier_1; chomp $sequence_2; chomp $identifier_2; $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers my $return; if ($bowtie2){ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1); } else{ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1); } unless ($return){ $return = 0; } # print the sequences to ambiguous_1 and _2 if --ambiguous was specified if ($ambiguous and $return == 2){ print AMBIG_1 $orig_identifier_1; print AMBIG_1 "$sequence_1\n"; print AMBIG_2 $orig_identifier_2; print AMBIG_2 "$sequence_2\n"; } # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified elsif ($unmapped and $return == 1){ print UNMAPPED_1 $orig_identifier_1; print UNMAPPED_1 "$sequence_1\n"; print UNMAPPED_2 $orig_identifier_2; print UNMAPPED_2 "$sequence_2\n"; } } warn "Processed $counting{sequences_count} sequences in total\n\n"; close OUT or die $!; print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } sub process_fastQ_files_for_paired_end_methylation_calls{ my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) ### of the converted genomes (either C->T or G->A version) ### gzipped version of the infiles if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; } else{ open (IN1,$sequence_file_1) or die $!; open (IN2,$sequence_file_2) or die $!; } my $count = 0; warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one while (1) { # reading from the first input file my $identifier_1 = <IN1>; my $sequence_1 = <IN1>; my $ident_1 = <IN1>; # not needed my $quality_value_1 = <IN1>; # not needed # reading from the second input file my $identifier_2 = <IN2>; my $sequence_2 = <IN2>; my $ident_2 = <IN2>; # not needed my $quality_value_2 = <IN2>; # not needed last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces $identifier_2 = fix_IDs($identifier_2); ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $counting{sequences_count}++; if ($counting{sequences_count}%1000000==0) { warn "Processed $counting{sequences_count} sequence pairs so far\n"; } my $orig_identifier_1 = $identifier_1; my $orig_identifier_2 = $identifier_2; chomp $sequence_1; chomp $identifier_1; chomp $sequence_2; chomp $identifier_2; chomp $quality_value_1; chomp $quality_value_2; $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID my $return; if ($bowtie2){ $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); } else{ $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); } unless ($return){ $return = 0; } # print the sequences to ambiguous_1 and _2 if --ambiguous was specified if ($ambiguous and $return == 2){ # seq_1 print AMBIG_1 $orig_identifier_1; print AMBIG_1 "$sequence_1\n"; print AMBIG_1 $ident_1; print AMBIG_1 "$quality_value_1\n"; # seq_2 print AMBIG_2 $orig_identifier_2; print AMBIG_2 "$sequence_2\n"; print AMBIG_2 $ident_2; print AMBIG_2 "$quality_value_2\n"; } # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified elsif ($unmapped and $return == 1){ # seq_1 print UNMAPPED_1 $orig_identifier_1; print UNMAPPED_1 "$sequence_1\n"; print UNMAPPED_1 $ident_1; print UNMAPPED_1 "$quality_value_1\n"; # seq_2 print UNMAPPED_2 $orig_identifier_2; print UNMAPPED_2 "$sequence_2\n"; print UNMAPPED_2 $ident_2; print UNMAPPED_2 "$quality_value_2\n"; } } warn "Processed $counting{sequences_count} sequences in total\n\n"; close OUT or die $!; print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); } sub check_bowtie_results_single_end{ my ($sequence,$identifier,$quality_value) = @_; unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout $quality_value = 'I'x(length$sequence); } my %mismatches = (); ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome foreach my $index (0..$#fhs){ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); ### if the sequence we are currently looking at produced an alignment we are doing various things with it if ($fhs[$index]->{last_seq_id} eq $identifier) { ############################################################### ### STEP I Now processing the alignment stored in last_line ### ############################################################### my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation ### we only continue to extract useful information about this alignment if 1 was returned if ($valid_alignment_found_1 == 1){ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; unless($mismatch_info){ $mismatch_info = ''; } chomp $mismatch_info; my $chromosome; if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ $chromosome = $mapped_chromosome; } else{ die "Chromosome number extraction failed for $mapped_chromosome\n"; } ### Now extracting the number of mismatches to the converted genome my $number_of_mismatches; if ($mismatch_info eq ''){ $number_of_mismatches = 0; } elsif ($mismatch_info =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info); $number_of_mismatches = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; } ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table my $alignment_location = join (":",$chromosome,$position); ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index ### number for the found alignment) unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; } $number_of_mismatches = undef; ################################################################################################################################################## ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will ### be returned as $valid_alignment_found and it will then be processed in the next round only. ################################################################################################################################################## my $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ my ($seq_id) = split (/\t/,$newline); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; } else { # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; next; } my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); ### we only continue to extract useful information about this second alignment if 1 was returned if ($valid_alignment_found_2 == 1){ ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; unless($mismatch_info){ $mismatch_info = ''; } chomp $mismatch_info; my $chromosome; if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ $chromosome = $mapped_chromosome; } else{ die "Chromosome number extraction failed for $mapped_chromosome\n"; } ### Now extracting the number of mismatches to the converted genome my $number_of_mismatches; if ($mismatch_info eq ''){ $number_of_mismatches = 0; } elsif ($mismatch_info =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info); $number_of_mismatches = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field\n"; } ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table ### extracting the chromosome number from the bowtie output (see above) my $alignment_location = join (":",$chromosome,$position); ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this ### case we are not writing the same entry out a second time. unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; } #################################################################################################################################### #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### #################################################################################################################################### $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ my ($seq_id) = split (/\t/,$newline); die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; next; } else { # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; next; } ### still within the 2nd sequence in correct orientation found } ### still withing the 1st sequence in correct orientation found } ### still within the if (last_seq_id eq identifier) condition } ### still within foreach index loop } ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file unless(%mismatches){ $counting{no_single_alignment_found}++; if ($unmapped){ return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified } else{ return; } } ####################################################################################################################################################### ####################################################################################################################################################### ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### ####################################################################################################################################################### ####################################################################################################################################################### ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) my $sequence_fails = 0; ### Declaring an empty hash reference which will store all information we need for the methylation call my $methylation_call_params; # hash reference! ### sorting in ascending order foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment if (scalar keys %{$mismatches{$mismatch_number}} == 1){ for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; } } elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite ### reaction. E.g. ### CAGTCACGCGCGCGCG will become ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: ### G->A conversion: ### highly methylated: CAATCACACACACACA ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. ### In the above example the number of transliterations required to transform the actual sequence ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. my @three_candidate_seqs; foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ my $transliterations_performed; if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); } elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); } else{ die "unexpected index number range $!\n"; } push @three_candidate_seqs,{ index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, mismatch_number => $mismatch_number, chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, position => $mismatches{$mismatch_number}->{$composite_location}->{position}, seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, transliterations_performed => $transliterations_performed, }; } ### sorting in ascending order for the lowest number of transliterations performed @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; # print "$first_array_element\t$second_array_element\t$third_array_element\n"; if (($first_array_element*2) < $second_array_element){ $counting{low_complexity_alignments_overruled_count}++; ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; } else{ $sequence_fails = 1; } } else{ $sequence_fails = 1; } ### after processing the alignment with the lowest number of mismatches we exit last; } ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions if ($sequence_fails == 1){ $counting{unsuitable_sequence_count}++; if ($ambiguous){ return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified } if ($unmapped){ return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified } else{ return 0; # => exits to next sequence (default) } } ### --DIRECTIONAL ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol if ($directional){ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; $counting{alignments_rejected_count}++; return 0; } } ### If the sequence has not been rejected so far it will have a unique best alignment $counting{unique_best_alignment_count}++; if ($pbat){ extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params); } else{ extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); } ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } ### otherwise we are set to perform the actual methylation call $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out } sub check_bowtie_results_single_end_bowtie2{ my ($sequence,$identifier,$quality_value) = @_; unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout $quality_value = 'I'x(length$sequence); } # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; my $alignment_ambiguous = 0; my %alignments = (); ### reading from the Bowtie 2 output filehandles foreach my $index (0..$#fhs){ # print "Index: $index\n"; # print "$fhs[$index]->{last_line}\n"; # print "$fhs[$index]->{last_seq_id}\n"; # sleep (1); ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); ### if the sequence we are currently looking at produced an alignment we are doing various things with it # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; if ($fhs[$index]->{last_seq_id} eq $identifier) { # SAM format specifications for Bowtie 2 # (1) Name of read that aligned # (2) Sum of all applicable flags. Flags relevant to Bowtie are: # 1 The read is one of a pair # 2 The alignment is one end of a proper paired-end alignment # 4 The read has no reported alignments # 8 The read is one of a pair and has no reported alignments # 16 The alignment is to the reverse reference strand # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand # 64 The read is mate 1 in a pair # 128 The read is mate 2 in a pair # 256 The read has multiple mapping states # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) # (5) Mapping quality (255 means MAPQ is not available) # (6) CIGAR string representation of alignment (* if unavailable) # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. # (10) Read sequence (reverse-complemented if aligned to the reverse strand) # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance if ($flag == 4){ ## reading in the next alignment, which must be the next sequence my $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ chomp $newline; my ($seq_id) = split (/\t/,$newline); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; if ($seq_id eq $identifier){ die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; } next; # next instance } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; next; } } # if there are one or more proper alignments we can extract the chromosome number my $chromosome; if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ $chromosome = $mapped_chromosome; } else{ die "Chromosome number extraction failed for $mapped_chromosome\n"; } ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string my ($alignment_score,$second_best,$MD_tag); my @fields = split (/\t/,$fhs[$index]->{last_line}); foreach (11..$#fields){ if ($fields[$_] =~ /AS:i:(.*)/){ $alignment_score = $1; } elsif ($fields[$_] =~ /XS:i:(.*)/){ $second_best = $1; } elsif ($fields[$_] =~ /MD:Z:(.*)/){ $MD_tag = $1; } } # warn "First best alignment_score is: '$alignment_score'\n"; # warn "MD tag is: '$MD_tag'\n"; die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag); if (defined $second_best){ # warn "second best alignment_score is: '$second_best'\n\n"; # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether if ($alignment_score == $second_best){ $alignment_ambiguous = 1; ## need to read and discard all additional ambiguous reads until we reach the next sequence until ($fhs[$index]->{last_seq_id} ne $identifier){ my $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ chomp $newline; my ($seq_id) = split (/\t/,$newline); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; last; # break free in case we have reached the end of the alignment output } } # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; } else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment my $alignment_location = join (":",$chromosome,$position); ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB unless (exists $alignments{$alignment_location}){ $alignments{$alignment_location}->{seq_id} = $id; $alignments{$alignment_location}->{alignment_score} = $alignment_score; $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; $alignments{$alignment_location}->{index} = $index; $alignments{$alignment_location}->{chromosome} = $chromosome; $alignments{$alignment_location}->{position} = $position; $alignments{$alignment_location}->{CIGAR} = $cigar; $alignments{$alignment_location}->{MD_tag} = $MD_tag; } ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence until ($fhs[$index]->{last_seq_id} ne $identifier){ my $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ chomp $newline; my ($seq_id) = split (/\t/,$newline); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; last; # break free in case we have reached the end of the alignment output } } # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; } } else{ # there is no second best hit, so we can just store this one and read in the next sequence my $alignment_location = join (":",$chromosome,$position); ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB unless (exists $alignments{$alignment_location}){ $alignments{$alignment_location}->{seq_id} = $id; $alignments{$alignment_location}->{alignment_score} = $alignment_score; $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; $alignments{$alignment_location}->{index} = $index; $alignments{$alignment_location}->{chromosome} = $chromosome; $alignments{$alignment_location}->{position} = $position; $alignments{$alignment_location}->{MD_tag} = $MD_tag; $alignments{$alignment_location}->{CIGAR} = $cigar; } my $newline = $fhs[$index]->{fh}-> getline(); if ($newline){ chomp $newline; my ($seq_id) = split (/\t/,$newline); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; if ($seq_id eq $identifier){ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; } } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; } } } } ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. if ($alignment_ambiguous == 1){ $counting{unsuitable_sequence_count}++; ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); # print "$ambiguous_read_output\n"; if ($ambiguous){ return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified } elsif ($unmapped){ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified } else{ return 0; } } ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file unless(%alignments){ $counting{no_single_alignment_found}++; # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); # print "$unmapped_read_output\n"; if ($unmapped){ return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified } else{ return 0; # default } } ####################################################################################################################################################### ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) ### alignment score we are discarding the sequence altogether. ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for ### opening (5) and extending (3 per bp) the gap. ####################################################################################################################################################### my $methylation_call_params; # hash reference which will store all information we need for the methylation call my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) ### print contents of %alignments for debugging # if (scalar keys %alignments > 1){ # print "\n******\n"; # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ # print "Loc: $alignment_location\n"; # print "ID: $alignments{$alignment_location}->{seq_id}\n"; # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; # print "Index $alignments{$alignment_location}->{index}\n"; # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; # print "pos: $alignments{$alignment_location}->{position}\n"; # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; # } # print "\n******\n"; # } ### if there is only 1 entry in the hash with we accept it as the best alignment if (scalar keys %alignments == 1){ for my $unique_best_alignment (keys %alignments){ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; } } ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case ### we boot the sequence altogether elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ my $best_alignment_score; my $best_alignment_location; foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ # print "$alignments{$alignment_location}->{alignment_score}\n"; unless (defined $best_alignment_score){ $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; $best_alignment_location = $alignment_location; # print "setting best alignment score: $best_alignment_score\n"; } else{ ### if the second best alignment has the same alignment score as the first one, the sequence will get booted if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ # warn "Same alignment score, the sequence will get booted!\n"; $sequence_fails = 1; last; # exiting after the second alignment since we know that the sequence has ambiguous alignments } ### else we are going to store the best alignment for further processing else{ $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; last; # exiting after processing the second alignment since the sequence produced a unique best alignment } } } } else{ die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; } ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions if ($sequence_fails == 1){ $counting{unsuitable_sequence_count}++; ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); # print OUT "$ambiguous_read_output\n"; if ($ambiguous){ return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified } elsif ($unmapped){ return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified } else{ return 0; # => exits to next sequence (default) } } ### --DIRECTIONAL ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol if ($directional){ if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; $counting{alignments_rejected_count}++; return 0; } } ### If the sequence has not been rejected so far it has a unique best alignment $counting{unique_best_alignment_count}++; ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } ### otherwise we are set to perform the actual methylation call $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out } sub determine_number_of_transliterations_performed{ my ($sequence,$read_conversion) = @_; my $number_of_transliterations; if ($read_conversion eq 'CT'){ $number_of_transliterations = $sequence =~ tr/C/T/; } elsif ($read_conversion eq 'GA'){ $number_of_transliterations = $sequence =~ tr/G/A/; } else{ die "Read conversion mode of the read was not specified $!\n"; } return $number_of_transliterations; } sub decide_whether_single_end_alignment_is_valid{ my ($index,$identifier) = @_; # extracting from Bowtie 1 format my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1]; ### ensuring that the entry is the correct sequence if (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier)){ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically ### sensible alignments my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); ### If the orientation was correct can we move on if ($orientation == 1){ return 1; ### 1st possibility for a sequence to pass } ### If the alignment was in the wrong orientation we need to read in a new line elsif($orientation == 0){ my $newline = $fhs[$index]->{fh}->getline(); if ($newline){ ($id,$strand) = (split (/\t/,$newline))[0,1]; ### ensuring that the next entry is still the correct sequence if ($id eq $identifier){ ### checking orientation again $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand); ### If the orientation was correct can we move on if ($orientation == 1){ $fhs[$index]->{last_seq_id} = $id; $fhs[$index]->{last_line} = $newline; return 1; ### 2nd possibility for a sequence to pass } ### If the alignment was in the wrong orientation again we need to read in yet another new line and store it in @fhs elsif ($orientation == 0){ $newline = $fhs[$index]->{fh}->getline(); if ($newline){ my ($seq_id) = split (/\t/,$newline); ### check if the next line still has the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with ### the same fields of the just read next entry die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier); $fhs[$index]->{last_seq_id} = $seq_id; $fhs[$index]->{last_line} = $newline; return 0; # not processing anything this round as the alignment currently stored in last_line was in the wrong orientation } else{ # assigning undef to last_seq_id and last_line (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation } } else{ die "The orientation of the alignment must be either correct or incorrect\n"; } } ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs else{ $fhs[$index]->{last_seq_id} = $id; $fhs[$index]->{last_line} = $newline; return 0; # processing the new alignment result only in the next round } } else { # assigning undef to last_seq_id and last_line (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line} = undef; return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation } } else{ die "The orientation of the alignment must be either correct or incorrect\n"; } } ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round else{ return 0; } } ######################### ### BOWTIE 1 | PAIRED-END ######################### sub check_bowtie_results_paired_ends{ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 unless ($quality_value_1){ $quality_value_1 = 'I'x(length$sequence_1); } unless ($quality_value_2){ $quality_value_2 = 'I'x(length$sequence_2); } # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; # sleep (1); my %mismatches = (); ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary ### strands are not being reported by specifying --directional foreach my $index (0,3,1,2){ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it if ($fhs[$index]->{last_seq_id} eq $identifier) { # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; ################################################################################## ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### ################################################################################## my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong ### orientation. We only continue to extract useful information about this alignment if 1 was returned if ($valid_alignment_found == 1){ ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. ### we store the useful information in %mismatches my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; chomp $mismatch_info_1; chomp $mismatch_info_2; ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted my ($chromosome_1,$chromosome_2); if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ $chromosome_1 = $mapped_chromosome_1; } else{ die "Chromosome number extraction failed for $mapped_chromosome_1\n"; } if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ $chromosome_2 = $mapped_chromosome_2; } else{ die "Chromosome number extraction failed for $mapped_chromosome_2\n"; } ### Now extracting the number of mismatches to the converted genome my $number_of_mismatches_1; my $number_of_mismatches_2; if ($mismatch_info_1 eq ''){ $number_of_mismatches_1 = 0; } elsif ($mismatch_info_1 =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info_1); $number_of_mismatches_1 = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field\n"; } if ($mismatch_info_2 eq ''){ $number_of_mismatches_2 = 0; } elsif ($mismatch_info_2 =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info_2); $number_of_mismatches_2 = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field\n"; } ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table die "Position 1 is higher than position 2" if ($position_1 > $position_2); die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index ### number for the found alignment) unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; } ################################################################################################################################################### ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### ### this round ### ################################################################################################################################################### my $newline_1 = $fhs[$index]->{fh}-> getline(); my $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag $fhs[$index]->{last_seq_id} = $seq_id_1; } elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag $fhs[$index]->{last_seq_id} = $seq_id_2; } else{ die "Either read 1 or read 2 needs to end on '/1'\n"; } $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; } else { # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; next; # jumping to the next index } ### Now processing the entry we just stored in last_line_1 and last_line_2 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation if ($valid_alignment_found == 1){ ### we store the useful information in %mismatches ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; chomp $mismatch_info_1; chomp $mismatch_info_2; ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ $chromosome_1 = $mapped_chromosome_1; } else{ die "Chromosome number extraction failed for $mapped_chromosome_1\n"; } if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ $chromosome_2 = $mapped_chromosome_2; } else{ die "Chromosome number extraction failed for $mapped_chromosome_2\n"; } $number_of_mismatches_1=''; $number_of_mismatches_2=''; ### Now extracting the number of mismatches to the converted genome if ($mismatch_info_1 eq ''){ $number_of_mismatches_1 = 0; } elsif ($mismatch_info_1 =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info_1); $number_of_mismatches_1 = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field\n"; } if ($mismatch_info_2 eq ''){ $number_of_mismatches_2 = 0; } elsif ($mismatch_info_2 =~ /^\d/){ my @mismatches = split (/,/,$mismatch_info_2); $number_of_mismatches_2 = scalar @mismatches; } else{ die "Something weird is going on with the mismatch field\n"; } ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table die "position 1 is greater than position 2" if ($position_1 > $position_2); die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); $alignment_location = join(":",$chromosome_1,$position_1,$position_2); ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index ### number for the found alignment) unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; } ############################################################################################################################################### ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ### ############################################################################################################################################### $newline_1 = $fhs[$index]->{fh}-> getline(); $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag $fhs[$index]->{last_seq_id} = $seq_id_1; } if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag $fhs[$index]->{last_seq_id} = $seq_id_2; } $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; } else { # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; next; # jumping to the next index } ### within the 2nd sequence pair alignment in correct orientation found } ### within the 1st sequence pair alignment in correct orientation found } ### still within the (last_seq_id eq identifier) condition } ### still within foreach index loop } ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file unless(%mismatches){ $counting{no_single_alignment_found}++; return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified } ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) my $sequence_pair_fails = 0; ### Declaring an empty hash reference which will store all information we need for the methylation call my $methylation_call_params; # hash reference! ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; } if (scalar keys %{$mismatches{$mismatch_number}} == 1){ # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ $methylation_call_params->{$identifier}->{seq_id} = $identifier; $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; } } else{ $sequence_pair_fails = 1; } ### after processing the alignment with the lowest number of mismatches we exit last; } ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions if ($sequence_pair_fails == 1){ $counting{unsuitable_sequence_count}++; if ($ambiguous){ return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified } if ($unmapped){ return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified } else{ return 0; # => exits to next sequence (default) } } ### --DIRECTIONAL ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol if ($directional){ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; $counting{alignments_rejected_count}++; return 0; } } ### If the sequence has not been rejected so far it does have a unique best alignment $counting{unique_best_alignment_count}++; extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); ### check test to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } ### otherwise we are set to perform the actual methylation call $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 } ######################### ### BOWTIE 2 | PAIRED-END ######################### sub check_bowtie_results_paired_ends_bowtie2{ my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 unless ($quality_value_1){ $quality_value_1 = 'I'x(length$sequence_1); } unless ($quality_value_2){ $quality_value_2 = 'I'x(length$sequence_2); } # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; my %alignments; my $alignment_ambiguous = 0; ### reading from the Bowtie 2 output filehandles ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary ### strands are not being reported when '--directional' is specified foreach my $index (0,3,1,2){ ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it if ($fhs[$index]->{last_seq_id} eq $identifier) { my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; $id_1 =~ s/\/1$//; $id_2 =~ s/\/2$//; # SAM format specifications for Bowtie 2 # (1) Name of read that aligned # (2) Sum of all applicable flags. Flags relevant to Bowtie are: # 1 The read is one of a pair # 2 The alignment is one end of a proper paired-end alignment # 4 The read has no reported alignments # 8 The read is one of a pair and has no reported alignments # 16 The alignment is to the reverse reference strand # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand # 64 The read is mate 1 in a pair # 128 The read is mate 2 in a pair # 256 The read has multiple mapping states # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) # (5) Mapping quality (255 means MAPQ is not available) # (6) CIGAR string representation of alignment (* if unavailable) # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. # (10) Read sequence (reverse-complemented if aligned to the reverse strand) # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). ### We can store the next alignment and move on to the next Bowtie 2 instance if ($flag_1 == 77 and $flag_2 == 141){ ## reading in the next alignment, which must be the next sequence my $newline_1 = $fhs[$index]->{fh}-> getline(); my $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ chomp $newline_1; chomp $newline_2; my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); $seq_id_1 =~ s/\/1$//; $seq_id_2 =~ s/\/2$//; $fhs[$index]->{last_seq_id} = $seq_id_1; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; # print "current sequence ($identifier) did not map, reading in next sequence\n"; # print "$index\t$fhs[$index]->{last_seq_id}\n"; # print "$index\t$fhs[$index]->{last_line_1}\n"; # print "$index\t$fhs[$index]->{last_line_2}\n"; next; # next instance } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; next; } } ### If there are one or more proper alignments we can extract the chromosome number my ($chromosome_1,$chromosome_2); if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ $chromosome_1 = $mapped_chromosome_1; } else{ die "Chromosome number extraction failed for $mapped_chromosome_1\n"; } if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ $chromosome_2 = $mapped_chromosome_2; } else{ die "Chromosome number extraction failed for $mapped_chromosome_2\n"; } die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); foreach (11..$#fields_1){ if ($fields_1[$_] =~ /AS:i:(.*)/){ $alignment_score_1 = $1; } elsif ($fields_1[$_] =~ /XS:i:(.*)/){ $second_best_1 = $1; } elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ $MD_tag_1 = $1; } } foreach (11..$#fields_2){ if ($fields_2[$_] =~ /AS:i:(.*)/){ $alignment_score_2 = $1; } elsif ($fields_2[$_] =~ /XS:i:(.*)/){ $second_best_2 = $1; } elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ $MD_tag_2 = $1; } } die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); # warn "First read 1 alignment score is: '$alignment_score_1'\n"; # warn "First read 2 alignment score is: '$alignment_score_2'\n"; # warn "MD tag 1 is: '$MD_tag_1'\n"; # warn "MD tag 2 is: '$MD_tag_2'\n"; ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; if (defined $second_best_1 and defined $second_best_2){ my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; # warn "Second best alignment_score_1 is: '$second_best_1'\n"; # warn "Second best alignment_score_2 is: '$second_best_2'\n"; # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ $alignment_ambiguous = 1; # print "This read will be chucked (AS==XS detected)!\n"; ## need to read and discard all additional ambiguous reads until we reach the next sequence until ($fhs[$index]->{last_seq_id} ne $identifier){ my $newline_1 = $fhs[$index]->{fh}-> getline(); my $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ chomp $newline_1; chomp $newline_2; my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); $seq_id_1 =~ s/\/1$//; $seq_id_2 =~ s/\/2$//; # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; $fhs[$index]->{last_seq_id} = $seq_id_1; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; } else{ # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; last; # break free if the end of the alignment output was reached } } # if ($fhs[$index]->{last_seq_id}){ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; # } } else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment my $alignment_location; if ($position_1 <= $position_2){ $alignment_location = join(":",$chromosome_1,$position_1,$position_2); } elsif($position_2 < $position_1){ $alignment_location = join(":",$chromosome_1,$position_2,$position_1); } ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB unless (exists $alignments{$alignment_location}){ $alignments{$alignment_location}->{seq_id} = $id_1; $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; $alignments{$alignment_location}->{index} = $index; $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine $alignments{$alignment_location}->{position_1} = $position_1; $alignments{$alignment_location}->{position_2} = $position_2; $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; $alignments{$alignment_location}->{flag_1} = $flag_1; $alignments{$alignment_location}->{flag_2} = $flag_2; } # warn "added best of several alignments to \%alignments hash\n"; ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence until ($fhs[$index]->{last_seq_id} ne $identifier){ my $newline_1 = $fhs[$index]->{fh}-> getline(); my $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ chomp $newline_1; chomp $newline_2; my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); $seq_id_1 =~ s/\/1$//; $seq_id_2 =~ s/\/2$//; # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; $fhs[$index]->{last_seq_id} = $seq_id_1; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; } else{ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; last; # break free if the end of the alignment output was reached } } # if($fhs[$index]->{last_seq_id}){ # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; # } } } else{ # there is no second best hit, so we can just store this one and read in the next sequence my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); # print "$alignment_location\n"; ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB unless (exists $alignments{$alignment_location}){ $alignments{$alignment_location}->{seq_id} = $id_1; $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; $alignments{$alignment_location}->{index} = $index; $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine $alignments{$alignment_location}->{position_1} = $position_1; $alignments{$alignment_location}->{position_2} = $position_2; $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; $alignments{$alignment_location}->{flag_1} = $flag_1; $alignments{$alignment_location}->{flag_2} = $flag_2; } # warn "added unique alignment to \%alignments hash\n"; # Now reading and storing the next read pair my $newline_1 = $fhs[$index]->{fh}-> getline(); my $newline_2 = $fhs[$index]->{fh}-> getline(); if ($newline_1 and $newline_2){ chomp $newline_1; chomp $newline_2; # print "$newline_1\n"; # print "$newline_2\n"; my ($seq_id_1) = split (/\t/,$newline_1); my ($seq_id_2) = split (/\t/,$newline_2); $seq_id_1 =~ s/\/1$//; $seq_id_2 =~ s/\/2$//; # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; $fhs[$index]->{last_seq_id} = $seq_id_1; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; if ($seq_id_1 eq $identifier){ die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; } } else{ # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; } } } } ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format if ($alignment_ambiguous == 1){ $counting{unsuitable_sequence_count}++; ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); # print "$ambiguous_read_1\n"; # print "$ambiguous_read_2\n"; if ($ambiguous){ return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified } elsif ($unmapped){ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified } else{ return 0; } } ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file unless (%alignments){ $counting{no_single_alignment_found}++; # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); # print "$unmapped_read_1\n"; # print "$unmapped_read_2\n"; if ($unmapped){ return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified } else{ return 0; } } ####################################################################################################################################################### ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) ### alignment score we are discarding the sequence pair altogether. ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) ### and extending (3 per bp) the gap. ####################################################################################################################################################### ### Declaring an empty hash reference which will store all information we need for the methylation call my $methylation_call_params; # hash reference my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) ### print contents of %alignments for debugging ## if (scalar keys %alignments >= 1){ # print "\n******\n"; # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ # print "Loc: $alignment_location\n"; # print "ID: $alignments{$alignment_location}->{seq_id}\n"; # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; # print "Index $alignments{$alignment_location}->{index}\n"; # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; # } # print "\n******\n"; # } ### if there is only 1 entry in the %alignments hash we accept it as the best alignment if (scalar keys %alignments == 1){ for my $unique_best_alignment (keys %alignments){ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; } } ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case ### we boot the sequence pair altogether) elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ my $best_sum_of_alignment_scores; my $best_alignment_location; foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; unless (defined $best_sum_of_alignment_scores){ $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; $best_alignment_location = $alignment_location; # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; } else{ ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; $sequence_pair_fails = 1; last; # exiting since we know that the sequence has ambiguous alignments } ### else we are going to store the best alignment for further processing else{ $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; last; # exiting since the sequence produced a unique best alignment } } } } else{ die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; } ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions if ($sequence_pair_fails == 1){ $counting{unsuitable_sequence_count}++; ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); # print "$ambiguous_read_1\n"; # print "$ambiguous_read_2\n"; if ($ambiguous){ return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified } elsif ($unmapped){ return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified } else{ return 0; # => exits to next sequence pair (default) } } ### --DIRECTIONAL ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol if ($directional){ if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; $counting{alignments_rejected_count}++; return 0; } } ### If the sequence pair has not been rejected so far it does have a unique best alignment $counting{unique_best_alignment_count}++; extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } ### now we are set to perform the actual methylation call $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; # print " $sequence_2\n"; # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 } ### sub decide_whether_paired_end_alignment_is_valid{ my ($index,$identifier) = @_; my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; chomp $mismatch_info_1; chomp $mismatch_info_2; my $seq_id_1 = $id_1; my $seq_id_2 = $id_2; $seq_id_1 =~ s/\/1$//; # removing the read /1 $seq_id_2 =~ s/\/1$//; # removing the read /1 ### ensuring that the current entry is the correct sequence if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically ### sensible alignments my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); ### If the orientation was correct can we move on if ($orientation == 1){ return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS } ### If the alignment was in the wrong orientation we need to read in two new lines elsif($orientation == 0){ my $newline_1 = $fhs[$index]->{fh}->getline(); my $newline_2 = $fhs[$index]->{fh}->getline(); if ($newline_1 and $newline_2){ ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time) ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1]; ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1]; my $seqid; $seq_id_1 = $id_1; $seq_id_2 = $id_2; # we need to capture the first read (ending on /1) if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag $seqid = $seq_id_1; } elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag $seqid = $seq_id_2; } else{ die "One of the two reads needs to end on /1!!"; } ### ensuring that the next entry is still the correct sequence if ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){ ### checking orientation again $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2); ### If the orientation was correct can we move on if ($orientation == 1){ ### Writing the current sequence to last_line_1 and last_line_2 $fhs[$index]->{last_seq_id} = $seqid; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS } ### If the alignment was in the wrong orientation again we need to read in yet another 2 new lines and store them in @fhs (this must be ### the next entry) elsif ($orientation == 0){ $newline_1 = $fhs[$index]->{fh}->getline(); $newline_2 = $fhs[$index]->{fh}->getline(); if ($newline_1 and $newline_2){ ($seq_id_1) = split (/\t/,$newline_1); ($seq_id_2) = split (/\t/,$newline_2); $seqid = ''; if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag $seqid = $seq_id_1; } elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag $seqid = $seq_id_2; } else{ die "One of the two reads needs to end on /1!!"; } ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with ### the same fields of the just read next entry die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier); $fhs[$index]->{last_seq_id} = $seqid; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation } else { ### assigning undef to last_seq_id and last_line (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation } } else{ die "The orientation of the alignment must be either correct or incorrect\n"; } } ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs else{ $fhs[$index]->{last_seq_id} = $seqid; $fhs[$index]->{last_line_1} = $newline_1; $fhs[$index]->{last_line_2} = $newline_2; return 0; # processing the new alignment result only in the next round } } else { # assigning undef to last_seq_id and both last_lines (end of bowtie output) $fhs[$index]->{last_seq_id} = undef; $fhs[$index]->{last_line_1} = undef; $fhs[$index]->{last_line_2} = undef; return 0; # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation } } else{ die "The orientation of the alignment must be either correct or incorrect\n"; } } ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round else{ return 0; } } ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END sub extract_corresponding_genomic_sequence_paired_ends { my ($sequence_identifier,$methylation_call_params) = @_; ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call my $alignment_read_1; my $alignment_read_2; my $read_conversion_info_1; my $read_conversion_info_2; my $genome_conversion; ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at oone of the ends so that we can also make a CpG, CHG or CHH methylation call ### if the C happens to be at the first or last position of the actually observed sequence my $non_bisulfite_sequence_1; my $non_bisulfite_sequence_2; ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic ### sequences around! ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ ### [Index 0, sequence originated from (converted) forward strand] $counting{CT_GA_CT_count}++; $alignment_read_1 = '+'; $alignment_read_2 = '-'; $read_conversion_info_1 = 'CT'; $read_conversion_info_2 = 'GA'; $genome_conversion = 'CT'; ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) ### for hits on the forward strand we need to capture 2 extra bases at the 3' end $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### the reverse strand sequence needs to be reverse complemented $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); } else{ $non_bisulfite_sequence_2 = ''; } } ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ ### [Index 1, sequence originated from complementary to (converted) reverse strand] $counting{GA_CT_GA_count}++; $alignment_read_1 = '+'; $alignment_read_2 = '-'; $read_conversion_info_1 = 'GA'; $read_conversion_info_2 = 'CT'; $genome_conversion = 'GA'; ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1) ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2 } else{ $non_bisulfite_sequence_1 = ''; } ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2) ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2 ### the reverse strand sequence needs to be reverse complemented $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); } ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ ### [Index 2, sequence originated from the complementary to (converted) forward strand] $counting{GA_CT_CT_count}++; $alignment_read_1 = '-'; $alignment_read_2 = '+'; $read_conversion_info_1 = 'GA'; $read_conversion_info_2 = 'CT'; $genome_conversion = 'CT'; ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2 ### the reverse strand sequence needs to be reverse complemented $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! ### Read 2 is CT converted so we need to capture 2 extra 3' bases if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2 } else{ $non_bisulfite_sequence_2 = ''; } } ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ ### [Index 3, sequence originated from the (converted) reverse strand] $counting{CT_GA_GA_count}++; $alignment_read_1 = '-'; $alignment_read_2 = '+'; $read_conversion_info_1 = 'CT'; $read_conversion_info_2 = 'GA'; $genome_conversion = 'GA'; ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!! ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' base after reverse complementation if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2 ### the reverse strand sequence needs to be reverse complemented $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); } else{ $non_bisulfite_sequence_1 = ''; } ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1) ### non_bisulfite_sequence_2 will later correspond to the read 2!!!! ### Read 2 is GA converted so we need to capture 2 extra 5' bases $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2 } else{ die "Too many bowtie result filehandles\n"; } ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; } ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ my ($sequence_identifier,$methylation_call_params) = @_; ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; # sleep(10); ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions my $alignment_read_1; my $alignment_read_2; my $read_conversion_info_1; my $read_conversion_info_2; my $genome_conversion; ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call ### if the C happens to be at the last position of the actually observed sequence my $non_bisulfite_sequence_1 = ''; my $non_bisulfite_sequence_2 = ''; ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; # parsing CIGAR 1 string my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation my @ops_1 = split (/\d+/,$cigar_1); # storing the operation shift @ops_1; # remove the empty first element die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); # parsing CIGAR 2 string my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation my @ops_2 = split (/\d+/,$cigar_2); # storing the operation shift @ops_2; # remove the empty first element die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output my $indels_2 = 0; ### Extracting read 1 genomic sequence ### # extracting 2 additional bp at the 5' end (read 1) if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; return; } $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); } foreach (0..$#len_1){ if ($ops_1[$_] eq 'M'){ # extracting genomic sequence $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); # warn "$non_bisulfite_sequence_1\n"; # adjusting position $pos_1 += $len_1[$_]; } elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls $non_bisulfite_sequence_1 .= 'N' x $len_1[$_]; # warn "$non_bisulfite_sequence_1\n"; # position doesn't need adjusting $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output } elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence # we do not add any genomic sequence but only adjust the position # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; $pos_1 += $len_1[$_]; $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output } elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; } else{ die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; } } ### 3' end of read 1 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; return; } $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); } ### Extracting read 2 genomic sequence ### ### 5' end of read 2 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; return; } $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); } foreach (0..$#len_2){ if ($ops_2[$_] eq 'M'){ # extracting genomic sequence $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); # warn "$non_bisulfite_sequence_2\n"; # adjusting position $pos_2 += $len_2[$_]; } elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls $non_bisulfite_sequence_2 .= 'N' x $len_2[$_]; # warn "$non_bisulfite_sequence_2\n"; # position doesn't need adjusting $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output } elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence # we do not add any genomic sequence but only adjust the position # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; $pos_2 += $len_2[$_]; $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output } elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; } else{ die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; } } ### 3' end of read 2 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise # need to set read 1 as well now to prevent warning # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; # sleep(5); $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; return; } $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); } ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ ### [Index 0, sequence originated from (converted) forward strand] $counting{CT_GA_CT_count}++; $alignment_read_1 = '+'; $alignment_read_2 = '-'; $read_conversion_info_1 = 'CT'; $read_conversion_info_2 = 'GA'; $genome_conversion = 'CT'; ### Read 1 is always the forward hit ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); } ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ ### [Index 1, sequence originated from complementary to (converted) bottom strand] $counting{GA_CT_GA_count}++; $alignment_read_1 = '+'; $alignment_read_2 = '-'; $read_conversion_info_1 = 'GA'; $read_conversion_info_2 = 'CT'; $genome_conversion = 'GA'; ### Read 1 is always the forward hit ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); } ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ ### [Index 2, sequence originated from the complementary to (converted) top strand] $counting{GA_CT_CT_count}++; $alignment_read_1 = '-'; $alignment_read_2 = '+'; $read_conversion_info_1 = 'GA'; $read_conversion_info_2 = 'CT'; $genome_conversion = 'CT'; ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); } ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ ### [Index 3, sequence originated from the (converted) reverse strand] $counting{CT_GA_GA_count}++; $alignment_read_1 = '-'; $alignment_read_2 = '+'; $read_conversion_info_1 = 'CT'; $read_conversion_info_2 = 'GA'; $genome_conversion = 'GA'; ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); } else{ die "Too many bowtie result filehandles\n"; } ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; ## the end position of a read is stored in $pos $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; } ########################################## ### PRINT SINGLE END RESULTS: Bowtie 1 ### ########################################## sub print_bisulfite_mapping_result_single_end{ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) if ($phred64){ $quality_value = convert_phred64_quals_to_phred33($quality_value); } elsif ($solexa){ $quality_value = convert_solexa_quals_to_phred33($quality_value); } ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position. $methylation_call_params->{$identifier}->{position} += 1; ### writing every uniquely mapped read and its methylation call to the output file if ($vanilla){ my $bowtie1_output = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_strand},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{position},$methylation_call_params->{$identifier}->{end_position},$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{methylation_call},$methylation_call_params->{$identifier}->{read_conversion},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value); print OUT "$bowtie1_output\n"; } else{ # SAM output, default since Bismark v1.0.0 single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script } } ########################################## ### PRINT SINGLE END RESULTS: Bowtie 2 ### ########################################## sub print_bisulfite_mapping_result_single_end_bowtie2{ my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_; ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) if ($phred64){ $quality_value = convert_phred64_quals_to_phred33($quality_value); } elsif ($solexa){ $quality_value = convert_solexa_quals_to_phred33($quality_value); } ### writing every mapped read and its methylation call to the SAM output file (unmapped and ambiguous reads were already printed) single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script } ########################################## ### PRINT PAIRED END ESULTS: Bowtie 1 ### ########################################## sub print_bisulfite_mapping_results_paired_ends{ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) if ($phred64){ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); } elsif ($solexa){ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); } ### We will add +1 bp to the start position of paired-end reads, as Bowtie 1 reports the index and not the bp position. (End position is already 1-based) $methylation_call_params->{$identifier}->{start_seq_1} += 1; ### writing every single aligned read and its methylation call to the output file if ($vanilla){ my $bowtie1_output_paired_end = join("\t",$identifier,$methylation_call_params->{$identifier}->{alignment_read_1},$methylation_call_params->{$identifier}->{chromosome},$methylation_call_params->{$identifier}->{start_seq_1},$methylation_call_params->{$identifier}->{alignment_end},$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{methylation_call_1},$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{methylation_call_2},$methylation_call_params->{$identifier}->{read_conversion_1},$methylation_call_params->{$identifier}->{genome_conversion},$quality_value_1,$quality_value_2); print OUT "$bowtie1_output_paired_end\n"; } else{ # SAM output, default since Bismark v1.0.0 paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script } } ########################################## ### PRINT PAIRED END ESULTS: Bowtie 2 ### ########################################## sub print_bisulfite_mapping_results_paired_ends_bowtie2{ my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_; ### we will output the FastQ quality in Sanger encoding (Phred 33 scale) if ($phred64){ $quality_value_1 = convert_phred64_quals_to_phred33($quality_value_1); $quality_value_2 = convert_phred64_quals_to_phred33($quality_value_2); } elsif ($solexa){ $quality_value_1 = convert_solexa_quals_to_phred33($quality_value_1); $quality_value_2 = convert_solexa_quals_to_phred33($quality_value_2); } ### writing every single aligned read and its methylation call to the output file (unmapped and ambiguous reads were already printed) paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script } sub convert_phred64_quals_to_phred33{ my $qual = shift; my @quals = split (//,$qual); my @new_quals; foreach my $index (0..$#quals){ my $phred_score = convert_phred64_quality_string_into_phred_score ($quals[$index]); my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); $new_quals[$index] = $phred33_quality_string; } my $phred33_quality = join ("",@new_quals); return $phred33_quality; } sub convert_solexa_quals_to_phred33{ my $qual = shift; my @quals = split (//,$qual); my @new_quals; foreach my $index (0..$#quals){ my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($quals[$index]); my $phred33_quality_string = convert_phred_score_into_phred33_quality_string ($phred_score); $new_quals[$index] = $phred33_quality_string; } my $phred33_quality = join ("",@new_quals); return $phred33_quality; } sub convert_phred_score_into_phred33_quality_string{ my $qual = shift; $qual = chr($qual+33); return $qual; } sub convert_phred64_quality_string_into_phred_score{ my $string = shift; my $qual = ord($string)-64; return $qual; } sub convert_solexa_pre1_3_quality_string_into_phred_score{ ### We will just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a minute difference for values between 0 and 10 my $string = shift; my $qual = ord($string)-59; return $qual; } sub extract_corresponding_genomic_sequence_single_end { my ($sequence_identifier,$methylation_call_params) = @_; ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions my $alignment_strand; my $read_conversion_info; my $genome_conversion; ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, ### if the C happens to be at the last position of the actually observed sequence my $non_bisulfite_sequence; ### depending on the conversion we want to make need to capture 1 extra base at the 3' end ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ ### [Index 0, sequence originated from (converted) forward strand] $counting{CT_CT_count}++; $alignment_strand = '+'; $read_conversion_info = 'CT'; $genome_conversion = 'CT'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 ### + 2 extra base at the 3' end $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 } else{ $non_bisulfite_sequence = ''; } } ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ ### [Index 1, sequence originated from (converted) reverse strand] $counting{CT_GA_count}++; $alignment_strand = '-'; $read_conversion_info = 'CT'; $genome_conversion = 'GA'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 ## reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } else{ $non_bisulfite_sequence = ''; } } ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ ### [Index 2, sequence originated from complementary to (converted) forward strand] $counting{GA_CT_count}++; $alignment_strand = '-'; $read_conversion_info = 'GA'; $genome_conversion = 'CT'; ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 ## reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } else{ $non_bisulfite_sequence = ''; } } ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ ### [Index 3, sequence originated from complementary to (converted) reverse strand] $counting{GA_GA_count}++; $alignment_strand = '+'; $read_conversion_info = 'GA'; $genome_conversion = 'GA'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 } else{ $non_bisulfite_sequence = ''; } } else{ die "Too many bowtie result filehandles\n"; } $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; ### at this point we can also determine the end position of a read $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); } sub extract_corresponding_genomic_sequence_single_end_pbat { my ($sequence_identifier,$methylation_call_params) = @_; ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions my $alignment_strand; my $read_conversion_info; my $genome_conversion; ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, ### if the C happens to be at the last position of the actually observed sequence my $non_bisulfite_sequence; ### depending on the conversion we want to make need to capture 1 extra base at the 3' end my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1! ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) if ($pbat_index == 0){ ### [Index 0, sequence originated from (converted) forward strand] $counting{CT_CT_count}++; $alignment_strand = '+'; $read_conversion_info = 'CT'; $genome_conversion = 'CT'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 ### + 2 extra base at the 3' end $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 } else{ $non_bisulfite_sequence = ''; } } ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) elsif ($pbat_index == 1){ ### [Index 1, sequence originated from (converted) reverse strand] $counting{CT_GA_count}++; $alignment_strand = '-'; $read_conversion_info = 'CT'; $genome_conversion = 'GA'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 ## reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } else{ $non_bisulfite_sequence = ''; } } ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) elsif ($pbat_index == 2){ ### [Index 2, sequence originated from complementary to (converted) forward strand] $counting{GA_CT_count}++; $alignment_strand = '-'; $read_conversion_info = 'GA'; $genome_conversion = 'CT'; ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 ## reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } else{ $non_bisulfite_sequence = ''; } } ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) elsif ($pbat_index == 3){ ### [Index 3, sequence originated from complementary to (converted) reverse strand] $counting{GA_GA_count}++; $alignment_strand = '+'; $read_conversion_info = 'GA'; $genome_conversion = 'GA'; ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 } else{ $non_bisulfite_sequence = ''; } } else{ die "Too many bowtie result filehandles\n"; } $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; ### at this point we can also determine the end position of a read $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); } sub extract_corresponding_genomic_sequence_single_end_bowtie2{ my ($sequence_identifier,$methylation_call_params) = @_; my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info}; my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions my $alignment_strand; my $read_conversion_info; my $genome_conversion; ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence my $non_bisulfite_sequence = ''; ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; # parsing CIGAR string my @len = split (/\D+/,$cigar); # storing the length per operation my @ops = split (/\d+/,$cigar); # storing the operation shift @ops; # remove the empty first element die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; return; } $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); } my $indels = 0; foreach (0..$#len){ if ($ops[$_] eq 'M'){ #extracting genomic sequence $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); # adjusting position $pos += $len[$_]; } elsif ($ops[$_] eq 'I'){ # insertion in the read sequence # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls $non_bisulfite_sequence .= 'N' x $len[$_]; # warn "$non_bisulfite_sequence\n"; # position doesn't need to be adjusting $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) } elsif ($ops[$_] eq 'D'){ # deletion in the read sequence # we do not add any genomic sequence but only adjust the position $pos += $len[$_]; $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) } elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; } else{ die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; } } ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; return; } $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; } ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ ### [Index 0, sequence originated from (converted) forward strand] $counting{CT_CT_count}++; $alignment_strand = '+'; $read_conversion_info = 'CT'; $genome_conversion = 'CT'; } ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ ### [Index 1, sequence originated from (converted) reverse strand] $counting{CT_GA_count}++; $alignment_strand = '-'; $read_conversion_info = 'CT'; $genome_conversion = 'GA'; ### reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ ### [Index 2, sequence originated from complementary to (converted) forward strand] $counting{GA_CT_count}++; $alignment_strand = '-'; $read_conversion_info = 'GA'; $genome_conversion = 'CT'; ### reverse complement! $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); } ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ ### [Index 3, sequence originated from complementary to (converted) reverse strand] $counting{GA_GA_count}++; $alignment_strand = '+'; $read_conversion_info = 'GA'; $genome_conversion = 'GA'; } else{ die "Too many Bowtie 2 result filehandles\n"; } $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; ### the end position of a read is stored in $pos $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; $methylation_call_params->{$sequence_identifier}->{indels} = $indels; } ### METHYLATION CALL sub methylation_call{ my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one my @seq = split(//,$sequence_actually_observed); my @genomic = split(//,$genomic_sequence); # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either ### CpG, CHH or CHG context ################################################################# ### . for bases not involving cytosines ### ### X for methylated C in CHG context (was protected) ### ### x for not methylated C in CHG context (was converted) ### ### H for methylated C in CHH context (was protected) ### ### h for not methylated C in CHH context (was converted) ### ### Z for methylated C in CpG context (was protected) ### ### z for not methylated C in CpG context (was converted) ### ### U for methylated C in unknown context (was protected) ### ### u for not methylated C in unknwon context (was converted) ### ################################################################# my @match =(); warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 my $methyl_CHH_count = 0; my $methyl_CHG_count = 0; my $methyl_CpG_count = 0; my $methyl_C_unknown_count = 0; my $unmethylated_CHH_count = 0; my $unmethylated_CHG_count = 0; my $unmethylated_CpG_count = 0; my $unmethylated_C_unknown_count = 0; if ($read_conversion eq 'CT'){ for my $index (0..$#seq) { if ($seq[$index] eq $genomic[$index]) { ### The residue can only be a C if it was not converted to T, i.e. protected my methylation if ($genomic[$index] eq 'C') { ### If the residue is a C we want to know if it was in CpG context or in any other context my $downstream_base = $genomic[$index+1]; if ($downstream_base eq 'G'){ ++$methyl_CpG_count; push @match,'Z'; # protected C, methylated, in CpG context } elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) ++$methyl_C_unknown_count; push @match,'U'; # protected C, methylated, in Unknown context } else { ### C in not in CpG-context, determining the second downstream base context my $second_downstream_base = $genomic[$index+2]; if ($second_downstream_base eq 'G'){ ++$methyl_CHG_count; push @match,'X'; # protected C, methylated, in CHG context } elsif ($second_downstream_base eq 'N'){ ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) push @match,'U'; # protected C, methylated, in Unknown context } else{ ++$methyl_CHH_count; push @match,'H'; # protected C, methylated, in CHH context } } } else { push @match, '.'; } } elsif ($seq[$index] ne $genomic[$index]) { ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts ### in the actually observed sequence if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context my $downstream_base = $genomic[$index+1]; if ($downstream_base eq 'G'){ ++$unmethylated_CpG_count; push @match,'z'; # converted C, not methylated, in CpG context } elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) ++$unmethylated_C_unknown_count; push @match,'u'; # converted C, not methylated, in Unknown context } else{ ### C in not in CpG-context, determining the second downstream base context my $second_downstream_base = $genomic[$index+2]; if ($second_downstream_base eq 'G'){ ++$unmethylated_CHG_count; push @match,'x'; # converted C, not methylated, in CHG context } elsif ($second_downstream_base eq 'N'){ ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) push @match,'u'; # converted C, not methylated, in Unknown context } else{ ++$unmethylated_CHH_count; push @match,'h'; # converted C, not methylated, in CHH context } } } ### all other mismatches are not of interest for a methylation call else { push @match,'.'; } } else{ die "There can be only 2 possibilities\n"; } } } elsif ($read_conversion eq 'GA'){ # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; for my $index (0..$#seq) { if ($seq[$index] eq $genomic[$index+2]) { ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation if ($genomic[$index+2] eq 'G') { ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need ### to look if the base upstream is a C my $upstream_base = $genomic[$index+1]; if ($upstream_base eq 'C'){ ++$methyl_CpG_count; push @match,'Z'; # protected C on opposing strand, methylated, in CpG context } elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) ++$methyl_C_unknown_count; push @match,'U'; # protected C on opposing strand, methylated, in Unknown context } else{ ### C in not in CpG-context, determining the second upstream base context my $second_upstream_base = $genomic[$index]; if ($second_upstream_base eq 'C'){ ++$methyl_CHG_count; push @match,'X'; # protected C on opposing strand, methylated, in CHG context } elsif ($second_upstream_base eq 'N'){ ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) push @match,'U'; # protected C, methylated, in Unknown context } else{ ++$methyl_CHH_count; push @match,'H'; # protected C on opposing strand, methylated, in CHH context } } } else{ push @match, '.'; } } elsif ($seq[$index] ne $genomic[$index+2]) { ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts ### on the opposing strand, so G to A conversions in the actually observed sequence if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! my $upstream_base = $genomic[$index+1]; if ($upstream_base eq 'C'){ ++$unmethylated_CpG_count; push @match,'z'; # converted C on opposing strand, not methylated, in CpG context } elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) ++$unmethylated_C_unknown_count; push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context } else{ ### C in not in CpG-context, determining the second upstream base context my $second_upstream_base = $genomic[$index]; if ($second_upstream_base eq 'C'){ ++$unmethylated_CHG_count; push @match,'x'; # converted C on opposing strand, not methylated, in CHG context } elsif ($second_upstream_base eq 'N'){ ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context } else{ ++$unmethylated_CHH_count; push @match,'h'; # converted C on opposing strand, not methylated, in CHH context } } } ### all other mismatches are not of interest for a methylation call else { push @match,'.'; } } else{ die "There can be only 2 possibilities\n"; } } } else{ die "Strand conversion info is required to perform a methylation call\n"; } my $methylation_call = join ("",@match); $counting{total_meCHH_count} += $methyl_CHH_count; $counting{total_meCHG_count} += $methyl_CHG_count; $counting{total_meCpG_count} += $methyl_CpG_count; $counting{total_meC_unknown_count} += $methyl_C_unknown_count; $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; return $methylation_call; } sub read_genome_into_memory{ ## working directoy my $cwd = shift; ## reading in and storing the specified genome in the %chromosomes hash chdir ($genome_folder) or die "Can't move to $genome_folder: $!"; print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n"; my @chromosome_filenames = <*.fa>; ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta unless (@chromosome_filenames){ @chromosome_filenames = <*.fasta>; } unless (@chromosome_filenames){ die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n"; } foreach my $chromosome_filename (@chromosome_filenames){ open (CHR_IN,$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n"; ### first line needs to be a fastA header my $first_line = <CHR_IN>; chomp $first_line; $first_line =~ s/\r//; ### Extracting chromosome name from the FastA header my $chromosome_name = extract_chromosome_name($first_line); my $sequence; while (<CHR_IN>){ chomp; $_ =~ s/\r//; if ($_ =~ /^>/){ ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA) if (exists $chromosomes{$chromosome_name}){ print "chr $chromosome_name (",length $sequence ," bp)\n"; die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n"; } else { if (length($sequence) == 0){ warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n"; } print "chr $chromosome_name (",length $sequence ," bp)\n"; $chromosomes{$chromosome_name} = $sequence; } ### resetting the sequence variable $sequence = ''; ### setting new chromosome name $chromosome_name = extract_chromosome_name($_); } else{ $sequence .= uc$_; } } if (exists $chromosomes{$chromosome_name}){ print "chr $chromosome_name (",length $sequence ," bp)\t"; die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n"; } else{ if (length($sequence) == 0){ warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n"; } print "chr $chromosome_name (",length $sequence ," bp)\n"; $chromosomes{$chromosome_name} = $sequence; } } print "\n"; chdir $cwd or die "Failed to move to directory $cwd\n"; } sub extract_chromosome_name { ## Bowtie seems to extract the first string after the inition > in the FASTA file, so we are doing this as well my $fasta_header = shift; if ($fasta_header =~ s/^>//){ my ($chromosome_name) = split (/\s+/,$fasta_header); return $chromosome_name; } else{ die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n"; } } sub reverse_complement{ my $sequence = shift; $sequence =~ tr/CATG/GTAC/; $sequence = reverse($sequence); return $sequence; } sub biTransformFastAFiles { my $file = shift; my ($dir,$filename); if ($file =~ /\//){ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; } else{ $filename = $file; } ### gzipped version of the infile if ($file =~ /\.gz$/){ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; } else{ open (IN,$file) or die "Couldn't read from file $file: $!\n"; } if ($skip){ warn "Skipping the first $skip reads from $file\n"; sleep (1); } if ($upto){ warn "Processing reads up to sequence no. $upto from $file\n"; sleep (1); } my $C_to_T_infile = my $G_to_A_infile = $filename; if ($gzip){ $C_to_T_infile =~ s/$/_C_to_T.fa.gz/; $G_to_A_infile =~ s/$/_G_to_A.fa.gz/; } else{ $C_to_T_infile =~ s/$/_C_to_T.fa/; $G_to_A_infile =~ s/$/_G_to_A.fa/; } if ($prefix){ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; $C_to_T_infile = "$prefix.$C_to_T_infile"; $G_to_A_infile = "$prefix.$G_to_A_infile"; # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; } warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; if ($gzip){ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; } else{ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; } unless ($directional){ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; if ($gzip){ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; } else{ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } } my $count = 0; while (1){ my $header = <IN>; my $sequence= <IN>; last unless ($header and $sequence); $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $sequence = uc$sequence; # make input file case insensitive # detecting if the input file contains tab stops, as this is likely to result in no alignments if (index($header,"\t") != -1){ $seqID_contains_tabs++; } ### small check if the sequence seems to be in FastA format die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/); my $sequence_C_to_T = $sequence; $sequence_C_to_T =~ tr/C/T/; print CTOT "$header$sequence_C_to_T"; unless ($directional){ my $sequence_G_to_A = $sequence; $sequence_G_to_A =~ tr/G/A/; print GTOA "$header$sequence_G_to_A"; } } close CTOT or die "Failed to close filehandle $!\n"; if ($directional){ warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n"; } else{ close GTOA or die "Failed to close filehandle $!\n"; warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; } return ($C_to_T_infile,$G_to_A_infile); } sub biTransformFastAFiles_paired_end { my ($file,$read_number) = @_; if ($gzip){ warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n"; sleep (2); } my ($dir,$filename); if ($file =~ /\//){ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; } else{ $filename = $file; } ### gzipped version of the infile if ($file =~ /\.gz$/){ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; } else{ open (IN,$file) or die "Couldn't read from file $file: $!\n"; } if ($skip){ warn "Skipping the first $skip reads from $file\n"; sleep (1); } if ($upto){ warn "Processing reads up to sequence no. $upto from $file\n"; sleep (1); } my $C_to_T_infile = my $G_to_A_infile = $filename; $C_to_T_infile =~ s/$/_C_to_T.fa/; $G_to_A_infile =~ s/$/_G_to_A.fa/; if ($prefix){ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; $C_to_T_infile = "$prefix.$C_to_T_infile"; $G_to_A_infile = "$prefix.$G_to_A_infile"; # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; } if ($directional){ if ($read_number == 1){ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; } elsif ($read_number == 2){ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } else{ die "Read number needs to be 1 or 2, but was: $read_number\n\n"; } } else{ # all four strand output warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } my $count = 0; while (1){ my $header = <IN>; my $sequence= <IN>; last unless ($header and $sequence); $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $sequence = uc$sequence; # make input file case insensitive # detecting if the input file contains tab stops, as this is likely to result in no alignments if (index($header,"\t") != -1){ $seqID_contains_tabs++; } ## small check if the sequence seems to be in FastA format die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/); if ($read_number == 1){ if ($bowtie2){ $header =~ s/$/\/1\/1/; } else{ $header =~ s/$/\/1/; } } elsif ($read_number == 2){ if ($bowtie2){ $header =~ s/$/\/2\/2/; } else{ $header =~ s/$/\/2/; } } else{ die "Read number needs to be 1 or 2, but was: $read_number\n\n"; } my $sequence_C_to_T = my $sequence_G_to_A = $sequence; $sequence_C_to_T =~ tr/C/T/; $sequence_G_to_A =~ tr/G/A/; if ($directional){ if ($read_number == 1){ print CTOT "$header$sequence_C_to_T"; } elsif ($read_number == 2){ print GTOA "$header$sequence_G_to_A"; } } else{ print CTOT "$header$sequence_C_to_T"; print GTOA "$header$sequence_G_to_A"; } } if ($directional){ if ($read_number == 1){ warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n"; } else{ warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n"; } } else{ warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n"; } if ($directional){ if ($read_number == 1){ return ($C_to_T_infile); } else{ return ($G_to_A_infile); } } else{ return ($C_to_T_infile,$G_to_A_infile); } } sub biTransformFastQFiles { my $file = shift; my ($dir,$filename); if ($file =~ /\//){ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; } else{ $filename = $file; } ### gzipped version of the infile if ($file =~ /\.gz$/){ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; } else{ open (IN,$file) or die "Couldn't read from file $file: $!\n"; } if ($skip){ warn "Skipping the first $skip reads from $file\n"; sleep (1); } if ($upto){ warn "Processing reads up to sequence no. $upto from $file\n"; sleep (1); } my $C_to_T_infile = my $G_to_A_infile = $filename; if ($prefix){ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; $C_to_T_infile = "$prefix.$C_to_T_infile"; $G_to_A_infile = "$prefix.$G_to_A_infile"; # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; } if ($pbat){ # PBAT-Seq if ($gzip){ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; } else{ $G_to_A_infile =~ s/$/_G_to_A.fastq/; } warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; if ($gzip){ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; } else{ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } } else{ # directional or non-directional if ($gzip){ $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; } else{ $C_to_T_infile =~ s/$/_C_to_T.fastq/; } warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; if ($gzip){ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; } else{ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option } unless ($directional){ if ($gzip){ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; } else{ $G_to_A_infile =~ s/$/_G_to_A.fastq/; } warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; if ($gzip){ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; } else{ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } } } my $count = 0; while (1){ my $identifier = <IN>; my $sequence = <IN>; my $identifier2 = <IN>; my $quality_score = <IN>; last unless ($identifier and $sequence and $identifier2 and $quality_score); $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces ++$count; if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $sequence = uc$sequence; # make input file case insensitive # detecting if the input file contains tab stops, as this is likely to result in no alignments if (index($identifier,"\t") != -1){ $seqID_contains_tabs++; } ## small check if the sequence file appears to be a FastQ file if ($count == 1){ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; } } if ($pbat){ my $sequence_G_to_A = $sequence; $sequence_G_to_A =~ tr/G/A/; print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); } else{ # directional or non-directional my $sequence_C_to_T = $sequence; $sequence_C_to_T =~ tr/C/T/; print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); unless ($directional){ my $sequence_G_to_A = $sequence; $sequence_G_to_A =~ tr/G/A/; print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); } } } if ($directional){ close CTOT or die "Failed to close filehandle $!\n"; warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; } elsif($pbat){ warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; close GTOA or die "Failed to close filehandle $!\n"; return ($G_to_A_infile); } else{ close CTOT or die "Failed to close filehandle $!\n"; close GTOA or die "Failed to close filehandle $!\n"; warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; } return ($C_to_T_infile,$G_to_A_infile); } sub biTransformFastQFiles_paired_end { my ($file,$read_number) = @_; my ($dir,$filename); if ($file =~ /\//){ ($dir,$filename) = $file =~ m/(.*\/)(.*)$/; } else{ $filename = $file; } ### gzipped version of the infile if ($file =~ /\.gz$/){ open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n"; } else{ open (IN,$file) or die "Couldn't read from file $file: $!\n"; } if ($skip){ warn "Skipping the first $skip reads from $file\n"; sleep (1); } if ($upto){ warn "Processing reads up to sequence no. $upto from $file\n"; sleep (1); } my $C_to_T_infile = my $G_to_A_infile = $filename; if ($gzip){ $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/; $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; } else{ $C_to_T_infile =~ s/$/_C_to_T.fastq/; $G_to_A_infile =~ s/$/_G_to_A.fastq/; } if ($prefix){ # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; $C_to_T_infile = "$prefix.$C_to_T_infile"; $G_to_A_infile = "$prefix.$G_to_A_infile"; # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; } if ($directional){ if ($read_number == 1){ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; if ($gzip){ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; } else{ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; } } elsif ($read_number == 2){ warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; if ($gzip){ open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; } else{ open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } } else{ die "Read number needs to be 1 or 2, but was $read_number!\n\n"; } } else{ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n"; if ($gzip){ open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n"; } else{ open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n"; } } my $count = 0; while (1){ my $identifier = <IN>; my $sequence = <IN>; my $identifier2 = <IN>; my $quality_score = <IN>; last unless ($identifier and $sequence and $identifier2 and $quality_score); ++$count; $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $sequence= uc$sequence; # make input file case insensitive ## small check if the sequence file appears to be a FastQ file if ($count == 1){ if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){ die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n"; } } my $sequence_C_to_T = my $sequence_G_to_A = $sequence; if ($read_number == 1){ if ($bowtie2){ $identifier =~ s/$/\/1\/1/; } else{ $identifier =~ s/$/\/1/; } } elsif ($read_number == 2){ if ($bowtie2){ $identifier =~ s/$/\/2\/2/; } else{ $identifier =~ s/$/\/2/; } } else{ die "Read number needs to be 1 or 2\n"; } $sequence_C_to_T =~ tr/C/T/; $sequence_G_to_A =~ tr/G/A/; if ($directional){ if ($read_number == 1){ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); } else{ print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); } } else{ print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score); print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score); } } if ($directional){ if ($read_number == 1){ warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n"; } else{ warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n"; } } else{ warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n"; } if ($directional){ if ($read_number == 1){ close CTOT or die "Failed to close filehandle $!\n"; return ($C_to_T_infile); } else{ close GTOA or die "Failed to close filehandle $!\n"; return ($G_to_A_infile); } } else{ close CTOT or die "Failed to close filehandle $!\n"; close GTOA or die "Failed to close filehandle $!\n"; return ($C_to_T_infile,$G_to_A_infile); } } ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES sub biTransformFastQFiles_paired_end_bowtie1_gzip { my ($file_1,$file_2) = @_; my ($dir,$filename); if ($file_1 =~ /\//){ ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; } else{ $filename = $file_1; } ### gzipped version of infile 1 if ($file_1 =~ /\.gz$/){ open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n"; } else{ open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; } ### gzipped version of infile 2 if ($file_2 =~ /\.gz$/){ open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n"; } else{ open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; } if ($skip){ warn "Skipping the first $skip reads from $file_1 and $file_2\n"; sleep (1); } if ($upto){ warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; sleep (1); } my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; if ($prefix){ # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; } $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; unless ($directional){ print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; } ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the the following format: ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate> my $count = 0; while (1){ my $identifier_1 = <IN_1>; my $sequence_1 = <IN_1>; my $identifier2_1 = <IN_1>; my $quality_score_1 = <IN_1>; my $identifier_2 = <IN_2>; my $sequence_2 = <IN_2>; my $identifier2_2 = <IN_2>; my $quality_score_2 = <IN_2>; last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); ++$count; ## small check if the sequence file appears to be a FastQ file if ($count == 1){ if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; } if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; } } $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces chomp $identifier_1; chomp $sequence_1; chomp $sequence_2; chomp $quality_score_1; chomp $quality_score_2; $identifier_1 =~ s/^\@//; $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) if ($skip){ next unless ($count > $skip); } if ($upto){ last if ($count > $upto); } $sequence_1 = uc$sequence_1; # make input file 1 case insensitive $sequence_2 = uc$sequence_2; # make input file 2 case insensitive # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; my $sequence_1_C_to_T = $sequence_1; my $sequence_2_G_to_A = $sequence_2; $sequence_1_C_to_T =~ tr/C/T/; $sequence_2_G_to_A =~ tr/G/A/; print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; unless ($directional){ my $sequence_1_G_to_A = $sequence_1; my $sequence_2_C_to_T = $sequence_2; $sequence_1_G_to_A =~ tr/G/A/; $sequence_2_C_to_T =~ tr/C/T/; print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; } } close CTPLUSGA or die "Couldn't close filehandle\n"; warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; if ($directional){ warn "\n"; return ($CT_plus_GA_infile); } else{ close GAPLUSCT or die "Couldn't close filehandle\n"; warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; return ($CT_plus_GA_infile,$GA_plus_CT_infile); } } sub fix_IDs{ my $id = shift; $id =~ s/[ \t]+/_/g; # replace spaces or tabs with underscores return $id; } sub ensure_sensical_alignment_orientation_single_end{ my $index = shift; # index number if the sequence produced an alignment my $strand = shift; ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one my $orientation = 0; ############################################################################################################## ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T) ## here we only want reads in the forward (+) orientation if ($fhs[$index]->{name} eq 'CTreadCTgenome') { ### if the alignment is (+) we count it, and return 1 for a correct orientation if ($strand eq '+') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the orientation equals (-) the alignment is nonsensical elsif ($strand eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } } ############################################################################################################### ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A) ## here we only want reads in the forward (-) orientation elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') { ### if the alignment is (-) we count it and return 1 for a correct orientation if ($strand eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the orientation equals (+) the alignment is nonsensical elsif ($strand eq '+') { $fhs[$index]->{wrong_strand}++; return $orientation; } } ############################################################################################################### ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T) ## here we only want reads in the forward (-) orientation elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') { ### if the alignment is (-) we count it and return 1 for a correct orientation if ($strand eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the orientation equals (+) the alignment is nonsensical elsif ($strand eq '+') { $fhs[$index]->{wrong_strand}++; return $orientation; } } ############################################################################################################### ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A) ## here we only want reads in the forward (+) orientation elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') { ### if the alignment is (+) we count it and return 1 for a correct orientation if ($strand eq '+') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the orientation equals (-) the alignment is nonsensical elsif ($strand eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } } else{ die "One of the above conditions must be true\n"; } } sub ensure_sensical_alignment_orientation_paired_ends{ my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one my $orientation = 0; ############################################################################################################## ## [Index 0, sequence originated from (converted) forward strand] ## CT converted read 1 ## GA converted read 2 ## CT converted genome ## here we only want read 1 in (+) orientation and read 2 in (-) orientation if ($fhs[$index]->{name} eq 'CTread1GAread2CTgenome') { ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } else{ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; } } ############################################################################################################### ## [Index 1, sequence originated from (converted) reverse strand] ## GA converted read 1 ## CT converted read 2 ## GA converted genome ## here we only want read 1 in (+) orientation and read 2 in (-) orientation elsif ($fhs[$index]->{name} eq 'GAread1CTread2GAgenome') { ### if the paired-end alignment is read1 (+) and read2 (-) we count it, and return 1 for a correct orientation if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } else{ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; } } ############################################################################################################### ## [Index 2, sequence originated from complementary to (converted) forward strand] ## GA converted read 1 ## CT converted read 2 ## CT converted genome ## here we only want read 1 in (-) orientation and read 2 in (+) orientation elsif ($fhs[$index]->{name} eq 'GAread1CTread2CTgenome') { ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } else{ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; } } ############################################################################################################### ## [Index 3, sequence originated from complementary to (converted) reverse strand] ## CT converted read 1 ## GA converted read 2 ## GA converted genome ## here we only want read 1 in (+) orientation and read 2 in (-) orientation elsif ($fhs[$index]->{name} eq 'CTread1GAread2GAgenome') { ### if the paired-end alignment is read1 (-) and read2 (+) we count it, and return 1 for a correct orientation if ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') { $fhs[$index]->{seen}++; $orientation = 1; return $orientation; } ### if the read 2 is in (+) orientation and read 1 in (-) the alignment is nonsensical elsif ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') { $fhs[$index]->{wrong_strand}++; return $orientation; } else{ die "id1: $id_1\tid2: $id_2\tThis should be impossible\n"; } } else{ die "One of the above conditions must be true\n"; } } ##################################################################################################################################################### ### Bowtie 1 (default) | PAIRED-END | FASTA sub paired_end_align_fragments_to_bisulfite_genome_fastA { my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; if ($directional){ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; } else{ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the ## data structure above if ($directional){ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { if ($directional){ unless ($fh->{inputfile_1}){ $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; next; } } my $bt_options = $bowtie_options; if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt_options .= ' --nofw'; } warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n"; open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; my $line_1 = $fh->{fh}->getline(); my $line_2 = $fh->{fh}->getline(); # if Bowtie produces an alignment we store the first line of the output if ($line_1 and $line_2) { chomp $line_1; chomp $line_2; my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. ### We will thus identify which sequence was read 1 and store this ID as last_seq_id if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present $fh->{last_seq_id} = $id_1; } elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present $fh->{last_seq_id} = $id_2; } else{ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; } $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; } # otherwise we just initialise last_seq_id and last_lines as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; } } } ### Bowtie 2 | PAIRED-END | FASTA sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; if ($directional){ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"; } else{ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the ## data structure above if ($directional){ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { if ($directional){ unless ($fh->{inputfile_1}){ $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; next; } } my $bt2_options = $bowtie_options; if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt2_options .= ' --nofw'; } warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence while (1){ $_ = $fh->{fh}->getline(); if ($_) { last unless ($_ =~ /^\@/); # SAM headers start with @ } else{ last; # no alignment output } } my $line_1 = $_; my $line_2 = $fh->{fh}->getline(); # if Bowtie produces an alignment we store the first line of the output if ($line_1 and $line_2) { chomp $line_1; chomp $line_2; my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. ### We will thus identify which sequence was read 1 and store this ID as last_seq_id if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with $fh->{last_seq_id} = $id_1; } elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present $fh->{last_seq_id} = $id_2; } else{ warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; } $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2 $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; } # otherwise we just initialise last_seq_id and last_lines as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; } } } ### Bowtie 1 (default) | PAIRED-END | FASTQ sub paired_end_align_fragments_to_bisulfite_genome_fastQ { my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; if ($directional){ warn "Input file is $C_to_T_infile_1 (FastQ)\n"; } elsif($pbat){ warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"; } else{ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the ## data structure above if ($directional or $pbat){ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { if ($directional or $pbat){ unless ($fh->{inputfile_1}){ $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; next; # skipping unwanted filehandles } } my $bt_options = $bowtie_options; if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt_options .= ' --nofw'; } if ($gzip){ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n"; open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!"; } else{ warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n"; sleep(5); open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; } my $line_1 = $fh->{fh}->getline(); my $line_2 = $fh->{fh}->getline(); # if Bowtie produces an alignment we store the first line of the output if ($line_1 and $line_2) { chomp $line_1; chomp $line_2; ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. ### We will thus identify which sequence was read 1 and store this ID as last_seq_id my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present $fh->{last_seq_id} = $id_1; } elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present $fh->{last_seq_id} = $id_2; } else{ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; } $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; } # otherwise we just initialise last_seq_id and last_lines as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; } } } ### Bowtie 2 | PAIRED-END | FASTQ sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; if ($directional){ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"; } else{ warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n"; } ## Now starting up 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the ## data structure above if ($directional){ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { if ($directional){ unless ($fh->{inputfile_1}){ $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; next; } } my $bt2_options = $bowtie_options; if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt2_options .= ' --nofw'; } warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n"; open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!"; ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence while (1){ $_ = $fh->{fh}->getline(); if ($_) { last unless ($_ =~ /^\@/); # SAM headers start with @ } else{ last; # no alignment output } } my $line_1 = $_; my $line_2 = $fh->{fh}->getline(); # if Bowtie produces an alignment we store the first line of the output if ($line_1 and $line_2) { chomp $line_1; chomp $line_2; ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2. ### We will thus identify which sequence was read 1 and store this ID as last_seq_id my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier) my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with $fh->{last_seq_id} = $id_1; } elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present $fh->{last_seq_id} = $id_2; } else{ die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n"; } $fh->{last_line_1} = $line_1; # this contains read 1 or read 2 $fh->{last_line_2} = $line_2; # this contains read 1 or read 2 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n"; } # otherwise we just initialise last_seq_id and last_lines as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_lines\n"; $fh->{last_seq_id} = undef; $fh->{last_line_1} = undef; $fh->{last_line_2} = undef; } } } ##################################################################################################################################################### ### Bowtie 1 (default) | SINGLE-END | FASTA sub single_end_align_fragments_to_bisulfite_genome_fastA { my ($C_to_T_infile,$G_to_A_infile) = @_; if ($directional){ warn "Input file is $C_to_T_infile (FastA)\n"; } else{ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in ## data structure above if ($directional){ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { my $bt_options = $bowtie_options; if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt_options .= ' --nofw'; } warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; if ($gzip){ open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; } else{ open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data } # if Bowtie produces an alignment we store the first line of the output $_ = $fh->{fh}->getline(); if ($_) { chomp; my $id = (split(/\t/))[0]; # this is the first element of the bowtie output (= the sequence identifier) $fh->{last_seq_id} = $id; $fh->{last_line} = $_; warn "Found first alignment:\t$fh->{last_line}\n"; } # otherwise we just initialise last_seq_id and last_line as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; $fh->{last_seq_id} = undef; $fh->{last_line} = undef; } } } ### Bowtie 2 | SINGLE-END | FASTA sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 { my ($C_to_T_infile,$G_to_A_infile) = @_; if ($directional){ warn "Input file is $C_to_T_infile (FastA)\n"; } else{ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in ## data structure above if ($directional){ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { my $bt2_options = $bowtie_options; if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt2_options .= ' --nofw'; } warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n"; open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence while (1){ $_ = $fh->{fh}->getline(); if ($_) { last unless ($_ =~ /^\@/); # SAM headers start with @ } else{ last; # no alignment output } } # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output if ($_) { chomp; my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) $fh->{last_seq_id} = $id; $fh->{last_line} = $_; warn "Found first alignment:\t$fh->{last_line}\n"; } # otherwise we just initialise last_seq_id and last_line as undefinded. This should only happen at the end of a file for Bowtie 2 output else { warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; $fh->{last_seq_id} = undef; $fh->{last_line} = undef; } } } ### Bowtie 1 (default) | SINGLE-END | FASTQ sub single_end_align_fragments_to_bisulfite_genome_fastQ { my ($C_to_T_infile,$G_to_A_infile) = @_; if ($directional){ warn "Input file is $C_to_T_infile (FastQ)\n"; } elsif($pbat){ warn "Input file is $G_to_A_infile (FastQ)\n"; } else{ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n"; } ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in ## the data structure above if ($directional or $pbat){ warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { my $bt_options = $bowtie_options; if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt_options .= ' --nofw'; } warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n"; sleep (5); if ($gzip){ open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!"; } else{ open ($fh->{fh},"$path_to_bowtie $bowtie_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data } # if Bowtie produces an alignment we store the first line of the output $_ = $fh->{fh}->getline(); if ($_) { chomp; my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier) $fh->{last_seq_id} = $id; $fh->{last_line} = $_; warn "Found first alignment:\t$fh->{last_line}\n"; } # otherwise we just initialise last_seq_id and last_line as undefined else { warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; $fh->{last_seq_id} = undef; $fh->{last_line} = undef; } } } ### Bowtie 2 | SINGLE-END | FASTQ sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 { my ($C_to_T_infile,$G_to_A_infile) = @_; if ($directional){ warn "Input file is $C_to_T_infile (FastQ)\n\n"; } else{ warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n"; } ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in ## the data structure above if ($directional){ warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } else{ warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"; } foreach my $fh (@fhs) { my $bt2_options = $bowtie_options; if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){ $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner } else { $bt2_options .= ' --nofw'; } warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n"; warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n"; open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence while (1){ $_ = $fh->{fh}->getline(); # warn "$_\n"; # sleep(1); if ($_) { last unless ($_ =~ /^\@/); # SAM headers start with @ } else { last; } } # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output if ($_) { chomp; my $id = (split(/\t/))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier) $fh->{last_seq_id} = $id; $fh->{last_line} = $_; warn "Found first alignment:\t$fh->{last_line}\n"; # warn "storing $id and\n$_\n"; } # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output else { warn "Found no alignment, assigning undef to last_seq_id and last_line\n"; $fh->{last_seq_id} = undef; $fh->{last_line} = undef; } } } ########################################################################################################################################### sub reset_counters_and_fhs{ my $filename = shift; %counting=( total_meCHH_count => 0, total_meCHG_count => 0, total_meCpG_count => 0, total_meC_unknown_count => 0, total_unmethylated_CHH_count => 0, total_unmethylated_CHG_count => 0, total_unmethylated_CpG_count => 0, total_unmethylated_C_unknown_count => 0, sequences_count => 0, no_single_alignment_found => 0, unsuitable_sequence_count => 0, genomic_sequence_could_not_be_extracted_count => 0, unique_best_alignment_count => 0, low_complexity_alignments_overruled_count => 0, CT_CT_count => 0, #(CT read/CT genome, original top strand) CT_GA_count => 0, #(CT read/GA genome, original bottom strand) GA_CT_count => 0, #(GA read/CT genome, complementary to original top strand) GA_GA_count => 0, #(GA read/GA genome, complementary to original bottom strand) CT_GA_CT_count => 0, #(CT read1/GA read2/CT genome, original top strand) GA_CT_GA_count => 0, #(GA read1/CT read2/GA genome, complementary to original bottom strand) GA_CT_CT_count => 0, #(GA read1/CT read2/CT genome, complementary to original top strand) CT_GA_GA_count => 0, #(CT read1/GA read2/GA genome, original bottom strand) alignments_rejected_count => 0, # only relevant if --directional was specified ); if ($directional){ if ($filename =~ ','){ # paired-end files @fhs=( { name => 'CTreadCTgenome', strand_identity => 'con ori forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'CTreadGAgenome', strand_identity => 'con ori reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadCTgenome', strand_identity => 'compl ori con forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadGAgenome', strand_identity => 'compl ori con reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, ); } else{ # single-end files @fhs=( { name => 'CTreadCTgenome', strand_identity => 'con ori forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'CTreadGAgenome', strand_identity => 'con ori reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, ); } } elsif($pbat){ if ($filename =~ ','){ # paired-end files @fhs=( { name => 'CTreadCTgenome', strand_identity => 'con ori forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'CTreadGAgenome', strand_identity => 'con ori reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadCTgenome', strand_identity => 'compl ori con forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadGAgenome', strand_identity => 'compl ori con reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, ); } else{ # single-end files @fhs=( { name => 'GAreadCTgenome', strand_identity => 'compl ori con forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadGAgenome', strand_identity => 'compl ori con reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, ); } } else{ @fhs=( { name => 'CTreadCTgenome', strand_identity => 'con ori forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'CTreadGAgenome', strand_identity => 'con ori reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadCTgenome', strand_identity => 'compl ori con forward', bisulfiteIndex => $CT_index_basename, seen => 0, wrong_strand => 0, }, { name => 'GAreadGAgenome', strand_identity => 'compl ori con reverse', bisulfiteIndex => $GA_index_basename, seen => 0, wrong_strand => 0, }, ); } } sub process_command_line{ my @bowtie_options; my $help; my $mates1; my $mates2; my $path_to_bowtie; my $fastq; my $fasta; my $skip; my $qupto; my $phred64; my $phred33; my $solexa; my $mismatches; my $seed_length; my $best; my $sequence_format; my $version; my $quiet; my $chunk; my $non_directional; my $ceiling; my $maxins; my $minins; my $unmapped; my $multi_map; my $output_dir; my $bowtie2; my $vanilla; my $sam_no_hd; my $seed_extension_fails; my $reseed_repetitive_seeds; my $most_valid_alignments; my $score_min; my $parallel; my $temp_dir; my $rdg; my $rfg; my $non_bs_mm; my $samtools_path; my $bam; my $gzip; my $pbat; my $prefix; my $old_flag; my $command_line = GetOptions ('help|man' => \$help, '1=s' => \$mates1, '2=s' => \$mates2, 'path_to_bowtie=s' => \$path_to_bowtie, 'f|fasta' => \$fasta, 'q|fastq' => \$fastq, 's|skip=i' => \$skip, 'u|upto=i' => \$qupto, 'phred33-quals' => \$phred33, 'phred64-quals|solexa1' => \$phred64, 'solexa-quals' => \$solexa, 'n|seedmms=i' => \$mismatches, 'l|seedlen=i' => \$seed_length, 'no_best' => \$best, 'version' => \$version, 'quiet' => \$quiet, 'chunkmbs=i' => \$chunk, 'non_directional' => \$non_directional, 'I|minins=i' => \$minins, 'X|maxins=i' => \$maxins, 'e|maqerr=i' => \$ceiling, 'un|unmapped' => \$unmapped, 'ambiguous' => \$multi_map, 'o|output_dir=s' => \$output_dir, 'bowtie2' => \$bowtie2, 'vanilla' => \$vanilla, 'sam-no-hd' => \$sam_no_hd, 'D=i' => \$seed_extension_fails, 'R=i' => \$reseed_repetitive_seeds, 'score_min=s' => \$score_min, 'most_valid_alignments=i' => \$most_valid_alignments, 'p=i' => \$parallel, 'temp_dir=s' => \$temp_dir, 'rdg=s' => \$rdg, 'rfg=s' => \$rfg, 'non_bs_mm' => \$non_bs_mm, 'samtools_path=s' => \$samtools_path, 'bam' => \$bam, 'gzip' => \$gzip, 'pbat' => \$pbat, 'prefix=s' => \$prefix, 'old_flag' => \$old_flag, ); ### EXIT ON ERROR if there were errors with any of the supplied options unless ($command_line){ die "Please respecify command line options\n"; } ### HELPFILE if ($help){ print_helpfile(); exit; } if ($version){ print << "VERSION"; Bismark - Bisulfite Mapper and Methylation Caller. Bismark Version: $bismark_version Copyright 2010-13 Felix Krueger, Babraham Bioinformatics www.bioinformatics.babraham.ac.uk/projects/ VERSION exit; } ########################## ### PROCESSING OPTIONS ### ########################## unless ($bowtie2){ $bowtie2 = 0; } unless ($sam_no_hd){ $sam_no_hd =0; } ### PATH TO BOWTIE ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH if ($path_to_bowtie){ unless ($path_to_bowtie =~ /\/$/){ $path_to_bowtie =~ s/$/\//; } if (-d $path_to_bowtie){ if ($bowtie2){ $path_to_bowtie = "${path_to_bowtie}bowtie2"; } else{ $path_to_bowtie = "${path_to_bowtie}bowtie"; } } else{ die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; } } else{ if ($bowtie2){ $path_to_bowtie = 'bowtie2'; warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } else{ $path_to_bowtie = 'bowtie'; warn "Path to Bowtie specified as: $path_to_bowtie\n"; } } ### OUTPUT REQUESTED AS BAM FILE if ($bam){ if ($vanilla){ die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; } ### PATH TO SAMTOOLS if (defined $samtools_path){ # if Samtools was specified as full command if ($samtools_path =~ /samtools$/){ if (-e $samtools_path){ # Samtools executable found } else{ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; } } else{ unless ($samtools_path =~ /\/$/){ $samtools_path =~ s/$/\//; } $samtools_path .= 'samtools'; if (-e $samtools_path){ # Samtools executable found } else{ die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; } } warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; $bam = 1; } # Check whether Samtools is in the PATH if no path was supplied by the user else{ if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH $samtools_path = `which samtools`; chomp $samtools_path; warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; $bam = 1; } } unless (defined $samtools_path){ $bam = 2; warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; } sleep (1); } #################################### ### PROCESSING ARGUMENTS ### GENOME FOLDER my $genome_folder = shift @ARGV; # mandatory unless ($genome_folder){ warn "Genome folder was not specified!\n"; print_helpfile(); exit; } ### checking that the genome folder, all subfolders and the required bowtie index files exist unless ($genome_folder =~/\/$/){ $genome_folder =~ s/$/\//; } if (chdir $genome_folder){ my $absolute_genome_folder = getcwd; ## making the genome folder path absolute unless ($absolute_genome_folder =~/\/$/){ $absolute_genome_folder =~ s/$/\//; } warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; $genome_folder = $absolute_genome_folder; } else{ die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n"; } my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; if ($bowtie2){ ### Bowtie 2 (new) ### checking the integrity of $CT_dir chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); foreach my $file(@CT_bowtie_index){ unless (-f $file){ die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n"; } } ### checking the integrity of $GA_dir chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); foreach my $file(@GA_bowtie_index){ unless (-f $file){ die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n"; } } } else{ ### Bowtie 1 (default) ### checking the integrity of $CT_dir chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); foreach my $file(@CT_bowtie_index){ unless (-f $file){ die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; } } ### checking the integrity of $GA_dir chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); foreach my $file(@GA_bowtie_index){ unless (-f $file){ die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; } } } my $CT_index_basename = "${CT_dir}BS_CT"; my $GA_index_basename = "${GA_dir}BS_GA"; ### INPUT OPTIONS ### SEQUENCE FILE FORMAT ### exits if both fastA and FastQ were specified if ($fasta and $fastq){ die "Only one sequence filetype can be specified (fastA or fastQ)\n"; } ### unless fastA is specified explicitely, fastQ sequence format is expected by default if ($fasta){ print "FastA format specified\n"; $sequence_format = 'FASTA'; push @bowtie_options, '-f'; } elsif ($fastq){ print "FastQ format specified\n"; $sequence_format = 'FASTQ'; push @bowtie_options, '-q'; } else{ $fastq = 1; print "FastQ format assumed (by default)\n"; $sequence_format = 'FASTQ'; push @bowtie_options, '-q'; } ### SKIP if ($skip){ warn "Skipping the first $skip reads from the input file\n"; # push @bowtie_options,"-s $skip"; } ### UPTO if ($qupto){ warn "Processing sequences up to read no. $qupto from the input file\n"; if ($bowtie2){ # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 } else{ # push @bowtie_options,"--qupto $qupto"; } } ### QUALITY VALUES if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; } if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. # Phred quality values work only when -q is specified unless ($fastq){ die "Phred quality values works only when -q (FASTQ) is specified\n"; } if ($bowtie2){ push @bowtie_options,"--phred33"; } else{ push @bowtie_options,"--phred33-quals"; } } if ($phred64){ # Phred quality values work only when -q is specified unless ($fastq){ die "Phred quality values work only when -q (FASTQ) is specified\n"; } if ($bowtie2){ push @bowtie_options,"--phred64"; } else{ push @bowtie_options,"--phred64-quals"; } } else{ $phred64 = 0; } if ($solexa){ if ($bowtie2){ die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; } # Solexa to Phred value conversion works only when -q is specified unless ($fastq){ die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; } push @bowtie_options,"--solexa-quals"; } else{ $solexa = 0; } ### ALIGNMENT OPTIONS ### MISMATCHES if (defined $mismatches){ if ($bowtie2){ if ($mismatches == 0 or $mismatches == 1){ push @bowtie_options,"-N $mismatches"; } else{ die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n"; } } else{ if ($mismatches >= 0 and $mismatches <= 3){ push @bowtie_options,"-n $mismatches"; } else{ die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n"; } } } else{ unless ($bowtie2){ push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 } } ### SEED LENGTH if (defined $seed_length){ if ($bowtie2){ push @bowtie_options,"-L $seed_length"; } else{ push @bowtie_options,"-l $seed_length"; } } ### MISMATCH CEILING if (defined $ceiling){ die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); push @bowtie_options,"-e $ceiling"; } ### BOWTIE 2 EFFORT OPTIONS ### CONSECUTIVE SEED EXTENSION FAILS if (defined $seed_extension_fails){ die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); push @bowtie_options,"-D $seed_extension_fails"; } ### RE-SEEDING REPETITIVE SEEDS if (defined $reseed_repetitive_seeds){ die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); push @bowtie_options,"-R $reseed_repetitive_seeds"; } ### BOWTIE 2 SCORING OPTIONS if ($score_min){ die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2); unless ($score_min =~ /^L,.+,.+$/){ die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; } push @bowtie_options,"--score-min $score_min"; } else{ if ($bowtie2){ push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2 } } ### BOWTIE 2 READ GAP OPTIONS my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); if ($rdg){ die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); if ($rdg =~ /^(\d+),(\d+)$/){ $deletion_open = $1; $deletion_extend = $2; } else{ die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; } push @bowtie_options,"--rdg $rdg"; } else{ $deletion_open = 5; $deletion_extend = 3; } ### BOWTIE 2 REFERENCE GAP OPTIONS if ($rfg){ die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); if ($rfg =~ /^(\d+),(\d+)$/){ $insertion_open = $1; $insertion_extend = $2; } else{ die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; } push @bowtie_options,"--rfg $rfg"; } else{ $insertion_open = 5; $insertion_extend = 3; } ### BOWTIE 2 PARALLELIZATION OPTIONS if (defined $parallel){ die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); } if ($bowtie2){ if ($parallel){ die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); push @bowtie_options,"-p $parallel"; push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; sleep (2); } } ### REPORTING OPTIONS if ($bowtie2){ push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while if(defined $most_valid_alignments){ warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; # push @bowtie_options,"-M $most_valid_alignments";sleep (5); } # else{ # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence # } } else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 push @bowtie_options,'-k 2'; } ### --BEST if ($bowtie2){ if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; } } else{ # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) unless ($best){ push @bowtie_options,'--best'; } } ### VANILLA BISMARK (BOWTIE 1) OUTPUT if ($vanilla){ if ($bowtie2){ die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; } } else{ $vanilla = 0; } ### PAIRED-END MAPPING if ($mates1){ my @mates1 = (split (/,/,$mates1)); die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2); my @mates2 = (split(/,/,$mates2)); unless (scalar @mates1 == scalar @mates2){ die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n"; } while (1){ my $mate1 = shift @mates1; my $mate2 = shift @mates2; last unless ($mate1 and $mate2); push @filenames,"$mate1,$mate2"; } if ($bowtie2){ push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones } if ($old_flag){ warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); sleep(3); } } elsif ($mates2){ die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; } ### SINGLE-END MAPPING # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified my $singles; unless ($mates1 and $mates2){ $singles = join (',',@ARGV); unless ($singles){ die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; } $singles =~ s/\s/,/g; @filenames = (split(/,/,$singles)); warn "\nFiles to be analysed:\n"; warn "@filenames\n\n"; sleep (3); } ### MININUM INSERT SIZE (PAIRED-END ONLY) if (defined $minins){ die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); push @bowtie_options,"--minins $minins"; } ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) if (defined $maxins){ die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); push @bowtie_options,"--maxins $maxins"; } else{ unless ($singles){ push @bowtie_options,'--maxins 500'; } } ### QUIET prints nothing besides alignments (suppresses warnings) if ($quiet){ push @bowtie_options,'--quiet'; } ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option if (defined $chunk){ push @bowtie_options,"--chunkmbs $chunk"; } else{ push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) } } ### SUMMARY OF ALL BOWTIE OPTIONS my $bowtie_options = join (' ',@bowtie_options); ### STRAND-SPECIFIC LIBRARIES my $directional; if ($non_directional){ die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; sleep (3); $directional = 0; } elsif($pbat){ die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2); die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; sleep (3); $directional = 0; } else{ warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; sleep (3); $directional = 1; # default behaviour } ### UNMAPPED SEQUENCE OUTPUT $unmapped = 0 unless ($unmapped); ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT $multi_map = 0 unless ($multi_map); ### OUTPUT DIRECTORY chdir $parent_dir or die "Failed to move back to current working directory\n"; if ($output_dir){ unless ($output_dir =~ /\/$/){ $output_dir =~ s/$/\//; } if (chdir $output_dir){ $output_dir = getcwd; # making the path absolute unless ($output_dir =~ /\/$/){ $output_dir =~ s/$/\//; } } else{ mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; warn "Created output directory $output_dir!\n\n"; chdir $output_dir or die "Failed to move to $output_dir\n"; $output_dir = getcwd; # making the path absolute unless ($output_dir =~ /\/$/){ $output_dir =~ s/$/\//; } } warn "Output will be written into the directory: $output_dir\n"; } else{ $output_dir = ''; } ### TEMPORARY DIRECTORY for C->T and G->A transcribed files chdir $parent_dir or die "Failed to move back to current working directory\n"; if ($temp_dir){ warn "\nUsing temp directory: $temp_dir\n"; unless ($temp_dir =~ /\/$/){ $temp_dir =~ s/$/\//; } if (chdir $temp_dir){ $temp_dir = getcwd; # making the path absolute unless ($temp_dir =~ /\/$/){ $temp_dir =~ s/$/\//; } } else{ mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; warn "Created temporary directory $temp_dir!\n\n"; chdir $temp_dir or die "Failed to move to $temp_dir\n"; $temp_dir = getcwd; # making the path absolute unless ($temp_dir =~ /\/$/){ $temp_dir =~ s/$/\//; } } warn "Temporary files will be written into the directory: $temp_dir\n"; } else{ $temp_dir = ''; } ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE if ($non_bs_mm){ if ($vanilla){ die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; } } ### PREFIX FOR OUTPUT FILES if ($prefix){ # removing trailing dots $prefix =~ s/\.+$//; warn "Using the following prefix for output files: $prefix\n\n"; sleep(1); } return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag); } sub generate_SAM_header{ print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order foreach my $chr (keys %chromosomes){ my $length = length ($chromosomes{$chr}); print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length } print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, PN = program name name, VN = program version } ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011) sub single_end_SAM_output{ my ($id,$actual_seq,$methylation_call_params,$qual) = @_; my $strand = $methylation_call_params->{$id}->{alignment_strand}; my $chr = $methylation_call_params->{$id}->{chromosome}; my $start = $methylation_call_params->{$id}->{position}; my $stop = $methylation_call_params->{$id}->{end_position}; my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; my $methcall = $methylation_call_params->{$id}->{methylation_call}; my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; my $number_of_mismatches; if ($bowtie2){ $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; } else{ $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; } ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" ## FLAG: bitwise FLAG. Each bit is explained in the following table: ## Bit Description Comment Value ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) ## 0x4 segment unmapped --- --- ## 0x8 next segment in the template unmapped --- --- ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) ## 0x80 the last segment in the template read 2 value: 2**7 (128) ## 0x100 secondary alignment --- --- ## 0x200 not passing quality controls --- --- ## 0x400 PCR or optical duplicate --- --- ##### my $flag; # FLAG variable used for SAM format. if ($strand eq "+"){ if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ $flag = 0; # 0 for "+" strand (OT) } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) } else{ die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; } } elsif ($strand eq "-"){ if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ $flag = 16; # 16 for "-" strand (OB) } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) } else{ die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; } } else{ die "Unexpected strand information: $strand\n\n"; } ##### my $mapq = 255; # Assume mapping quality is unavailable ##### my $cigar; if ($bowtie2){ $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 } else{ $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) } ##### my $rnext = "*"; # Paired-end variable ##### my $pnext = 0; # Paired-end variable ##### my $tlen = 0; # Paired-end variable ##### if ($read_conversion eq 'CT'){ $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands } else{ $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries } if ($strand eq '-'){ $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well } ##### my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string # into the reference string. hemming_dist() if ($bowtie2){ $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence } my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences ##### my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) ##### my $XM_tag; # Optional tag XM: Methylation Call String if ($strand eq '+'){ $XM_tag = "XM:Z:$methcall"; } elsif ($strand eq '-'){ $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well } ##### my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion ##### my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion ##### # Optionally calculating number of mismatches for Bowtie 2 alignments if ($non_bs_mm) { if ($bowtie2) { $number_of_mismatches =~ s/-//; # removing the minus sign ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches if ($cigar =~ /(D|I)/) { # warn "$cigar\n"; # parsing CIGAR string my @len = split (/\D+/,$cigar); # storing the length per operation my @ops = split (/\d+/,$cigar); # storing the operation shift @ops; # remove the empty first element die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); foreach (0..$#len) { if ($ops[$_] eq 'M') { # warn "skipping\n"; next; # irrelevant } elsif ($ops[$_] eq 'I') { # insertion in the read sequence $number_of_mismatches -= $insertion_open; $number_of_mismatches -= $len[$_] * $insertion_extend; # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; } elsif ($ops[$_] eq 'D') { # deletion in the read sequence $number_of_mismatches -= $deletion_open; $number_of_mismatches -= $len[$_] * $deletion_extend; # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; } elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; } else { die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; } } # warn "Alignment score $number_of_mismatches\n"; # print "Mismatches $number_of_mismatches\n\n"; } ### Now we have InDel corrected alignment scores ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the ### sequence contained more than 5 Ns, but this should occur close to never my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division # warn "N count: $seq_N_count\n"; $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; # warn "MM $number_of_mismatches\n"; } } #### my $XA_tag = "XA:Z:$number_of_mismatches"; ##### # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields ### optionally print number of non-bisulfite mismatches if ($non_bs_mm){ print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; } else{ # default # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; } } sub paired_end_SAM_output{ my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; my $chr = $methylation_call_params->{$id}->{chromosome}; my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; my $id_1; my $id_2; if ($old_flag){ $id_1 = $id.'/1'; $id_2 = $id.'/2'; } else{ $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard $id_2 = $id; } # Allows all degenerate nucleotide sequences in reference genome die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; my $index; # used to store the srand origin of the alignment in a less convoluted way if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ $index = 0; ## this is OT (original top strand) } elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ $index = 1; ## this is CTOB (complementary to OB) } elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ $index = 2; ## this is CTOT (complementary to OT) } elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ $index = 3; ## this is OB (original bottom) } else { die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; } my $number_of_mismatches_1; my $number_of_mismatches_2; if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; } else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation if ($index == 2 or $index == 3){ # CTOT or OB $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; } else{ # if the first read aligned in forward direction it is like for Bowtie 2 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; } } ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the ### first or last position. if ($index == 0 or $index == 3){ # OT or OB $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); } else{ # CTOT or CTOB $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); } ##### my $start_read_1; my $start_read_2; # adjusting end positions if ($bowtie2){ $start_read_1 = $methylation_call_params->{$id}->{position_1}; $start_read_2 = $methylation_call_params->{$id}->{position_2}; } else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; } else{ # read 1 is on the - strand $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; } } ##### my $end_read_1; my $end_read_2; # adjusting end positions if ($bowtie2){ $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; } else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; } else{ $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; } } ##### ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" ## FLAG: bitwise FLAG. Each bit is explained in the following table: ## Bit Description Comment Value ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) ## 0x4 segment unmapped --- --- ## 0x8 next segment in the template unmapped --- --- ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) ## 0x80 the last segment in the template read 2 value: 2^^7 (128) ## 0x100 secondary alignment --- --- ## 0x200 not passing quality controls --- --- ## 0x400 PCR or optical duplicate --- --- ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) my $flag_1; # FLAG variable used for SAM format my $flag_2; ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH) if ($index == 0){ # OT unless ($old_flag){ $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) } else{ $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) } } elsif ($index == 1){ # CTOB unless($old_flag){ $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) } else{ $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) } } elsif ($index == 2){ # CTOT unless ($old_flag){ $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) } else{ $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) } } elsif ($index == 3){ # OB unless ($old_flag){ $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) } else{ $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) } } ##### my $mapq = 255; # Mapping quality is unavailable ##### my $cigar_1; my $cigar_2; if ($bowtie2){ $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; } else{ $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) $cigar_2 = length($actual_seq_2) . "M"; } ##### my $rnext = '='; # Chromosome of mate; applies to both reads ##### my $pnext_1 = $start_read_2; # Leftmost position of mate my $pnext_2 = $start_read_1; ##### my $tlen_1; # signed observed Template LENgth (or inferred fragment size) my $tlen_2; if ($bowtie2){ if ($start_read_1 <= $start_read_2){ # Read 1 alignment is leftmost if ($end_read_2 >= $end_read_1){ # -------------------------> read 1 reads overlapping # <------------------------- read 2 # # or # # -------------------------> read 1 # <----------------------- read 2 read 2 contained within read 1 # # or # # -------------------------> read 1 reads 1 and 2 exactly overlapping # <------------------------- read 2 # # dovetailing of reads is not enabled for Bowtie 2 alignments $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign } elsif ($end_read_2 < $end_read_1){ # -------------------------> read 1 # <----------- read 2 read 2 contained within read 1 # # or # # -------------------------> read 1 # <------------------------ read 2 read 2 contained within read 1 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this ### as a request by frozenlyse on SeqAnswers on 24 July 2013 } } elsif ($start_read_2 < $start_read_1){ if ($end_read_1 >= $end_read_2){ # Read 2 alignment is leftmost # -------------------------> read 2 reads overlapping # <------------------------- read 1 # # or # # -------------------------> read 2 # <----------------------- read 1 read 1 contained within read 2 # # $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign } elsif ($end_read_1 < $end_read_2){ # -------------------------> read 2 # <----------- read 1 read 1 contained within read 2 # # or # # -------------------------> read 2 # <------------------------ read 1 read 1 contained within read 2 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this ### as a request by frozenlyse on SeqAnswers on 24 July 2013 } } } else{ # Bowtie 1 if ($end_read_2 >= $end_read_1){ # Read 1 alignment is leftmost # -------------------------> read 1 # <------------------------- read 2 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign } else{ # Read 2 alignment is leftmost # -------------------------> read 2 # <------------------------- read 1 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign } } ##### # adjusting the strand of the sequence before we use them to generate mismatch strings if ($strand_1 eq '-'){ $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence $qual_1 = reverse $qual_1; # we need to reverse the quality string as well } if ($strand_2 eq '-'){ $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well } # print "$actual_seq_1\n$ref_seq_1\n\n"; # print "$actual_seq_2\n$ref_seq_2\n\n"; ##### my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); if ($bowtie2){ $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence } my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences ##### my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); ##### my $XM_tag_1; # Optional tag XM: Methylation call string my $XM_tag_2; if ($strand_1 eq '-'){ $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented } else{ $XM_tag_1 = "XM:Z:$methcall_1"; } if ($strand_2 eq '-'){ $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented } else{ $XM_tag_2 = "XM:Z:$methcall_2"; } ##### my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state ##### my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads ##### # Optionally calculating number of mismatches for Bowtie 2 alignments if ($non_bs_mm) { if ($bowtie2) { $number_of_mismatches_1 =~ s/-//; # removing the minus sign $number_of_mismatches_2 =~ s/-//; ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches ### CIGAR 1 if ($cigar_1 =~ /(D|I)/) { # warn "$cigar_1\n"; # parsing CIGAR string my @len = split (/\D+/,$cigar_1); # storing the length per operation my @ops = split (/\d+/,$cigar_1); # storing the operation shift @ops; # remove the empty first element die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); foreach (0..$#len) { if ($ops[$_] eq 'M') { # warn "skipping\n"; next; # irrelevant } elsif ($ops[$_] eq 'I') { # insertion in the read sequence $number_of_mismatches_1 -= $insertion_open; $number_of_mismatches_1 -= $len[$_] * $insertion_extend; # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; } elsif ($ops[$_] eq 'D') { # deletion in the read sequence $number_of_mismatches_1 -= $deletion_open; $number_of_mismatches_1 -= $len[$_] * $deletion_extend; # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; } elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; } else { die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; } } # warn "Alignment score $number_of_mismatches_1\n"; # print "Mismatches $number_of_mismatches_1\n\n"; } ### CIGAR 2 if ($cigar_2 =~ /(D|I)/) { # warn "$cigar_2\n"; # parsing CIGAR string my @len = split (/\D+/,$cigar_2); # storing the length per operation my @ops = split (/\d+/,$cigar_2); # storing the operation shift @ops; # remove the empty first element die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); foreach (0..$#len) { if ($ops[$_] eq 'M') { # warn "skipping\n"; next; #irrelevant } elsif ($ops[$_] eq 'I') { # insertion in the read sequence $number_of_mismatches_2 -= $insertion_open; $number_of_mismatches_2 -= $len[$_] * $insertion_extend; # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; } elsif ($ops[$_] eq 'D') { # deletion in the read sequence $number_of_mismatches_2 -= $deletion_open; $number_of_mismatches_2 -= $len[$_] * $deletion_extend; # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; } elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; } else { die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; } } } ### Now we have InDel corrected Alignment scores ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the ### sequence contained more than 5 Ns, but this should occur close to never my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division my $seq_2_N_count = $number_of_mismatches_2 % 6; # warn "N count 1: $seq_1_N_count\n"; # warn "N count 2: $seq_2_N_count\n"; $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; # warn "MM1 $number_of_mismatches_1 \n"; # warn "MM2 $number_of_mismatches_2 \n"; } } #### my $XA_tag = "XA:Z:$number_of_mismatches_1"; my $XB_tag = "XB:Z:$number_of_mismatches_2"; # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields ### optionally print number of non-bisulfite mismatches if ($non_bs_mm){ print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; } else{ # default print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; } } sub revcomp{ my $seq = shift or die "Missing seq to reverse complement\n"; $seq = reverse $seq; $seq =~ tr/ACTGactg/TGACTGAC/; return $seq; } sub hemming_dist{ my $matches = 0; my @actual_seq = split //,(shift @_); my @ref_seq = split //,(shift @_); foreach (0..$#actual_seq){ ++$matches if ($actual_seq[$_] eq $ref_seq[$_]); } return my $hd = scalar @actual_seq - $matches; } sub make_mismatch_string{ my $actual_seq = shift or die "Missing actual sequence"; my $ref_seq = shift or die "Missing reference sequence"; my $XX_tag = "XX:Z:"; my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison my $prev_mm_pos = 0; while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) $prev_mm_pos = pos($tmp); # Position of last mismatch } my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence return $XX_tag; } sub print_helpfile{ print << "HOW_TO"; This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. DESCRIPTION The following is a brief description of command line options and arguments to control the Bismark bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original sequence from the genome and determine if there were any protected C's present or not. As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be re-enabled by using --non_directional. The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} ARGUMENTS: <genome_folder> The path to the folder containing the unmodified reference genome as well as the subfolders created by the Bismark_Genome_Preparation script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). Bismark expects one or more fastA files in this folder (file extension: .fa or .fasta). The path can be relative or absolute. -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <mates2>. Reads may be a mix of different lengths. Bismark will produce one mapping result and one report file per paired-end input file pair. -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must correspond file-for-file and read-for-read with those specified in <mates1>. Reads may be a mix of different lengths. <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g. lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will produce one mapping result and one report file per input file. OPTIONS: Input: -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles> are FASTQ files (usually having extension .fg or .fastq). This is the default. See also --solexa-quals. -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles> are FASTA files (usually havin extension .fa, .mfa, .fna or similar). All quality values are assumed to be 40 on the Phred scale. FASTA files are expected to contain both the read name and the sequence on a single line (and not spread over several lines). -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input. -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit. --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled (which can't). The formula for conversion is: phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This is usually the right option for use with (unconverted) reads emitted by the GA Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) reads emitted by GA Pipeline version 1.3 or later. Default: off. --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not specified it is assumed that Bowtie (1 or 2) is in the PATH. Alignment: -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds quality values to the nearest 10 and saturates at 30. This value is not relevant for Bowtie 2. --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in --best mode. Best-first search must keep track of many paths at once to ensure it is always extending the path with the lowest cumulative cost. Bowtie tries to minimize the memory impact of the descriptors, but they can still grow very large in some cases. If you receive an error message saying that chunk memory has been exhausted in --best mode, try adjusting this parameter up to dedicate more memory to the descriptors. This value is not relevant for Bowtie 2. Default: 512. -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and a paired-end alignment consists of two 20-bp alignments in the appropriate orientation with a 20-bp gap between them, that alignment is considered valid (as long as -X is also satisfied). A 19-bp gap would not be valid in that case. Default: 0. -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and a paired-end alignment consists of two 20-bp alignments in the proper orientation with a 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). A 61-bp gap would not be valid in that case. Default: 500. Bowtie 1 Reporting: -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option will be used by default. --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum (i.e. number of mismatches, or mismatches in the seed in the case if -n mode) and in terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both have Phred quality 10. When --best is not specified, Bowtie may report alignments that are sub-optimal in terms of stratum and/or quality (though an effort is made to report the best alignment). --best mode also removes all strand bias. Note that --best does not affect which alignments are considered "valid" by Bowtie, only which valid alignments are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. Default: on. --no_best Disables the --best option which is on by default. This can speed up the alignment process, e.g. for testing purposes, but for credible results it is not recommended to disable --best. Output: --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four bisulfite strands will be reported. Default: OFF. (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary to the original strands are merely theoretical and should not exist in reality. Specifying directional alignments (which is the default) will only run 2 alignment threads to the original top (OT) or bottom (OB) strands in parallel and report these alignments. This is the recommended option for sprand-specific libraries). --pbat This options may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT and OB ones. Use this option only if you are certain that your libraries were constructed following a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed temporary files only). --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are split up into several smaller files to run concurrently and the output files are to be merged. --quiet Print nothing besides alignments. --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead of SAM format output. -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will appear as they did in the input, without any translation of quality values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and unmapped_reads_2.txt. Reads with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest mismatches or other reads that fail to align uniquely to a file in the output directory. Written reads will appear as they did in the input, without any of the translation of quality values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 and _2 inserted in theit filenames, i.e. _ambiguous_reads_1.txt and _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt to create it first. The path to the output folder can be either relative or absolute. --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If the specified folder does not exist, Bismark will attempt to create it first. The path to the temporary folder can be either relative or absolute. --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' and 'XB:Z:number of mismatches' for read 2 of paired-end reads. --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk space. This option is available for most alignment modes but is not available for paired-end FastA files. This option might be somewhat slower than writing out uncompressed files, but this awaits further testing. --bam The output will be written out in BAM format instead of the default SAM format. Bismark will attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified explicitly if Samtools is in the PATH already. --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since both the appended read IDs and custom FLAG values may cause problems with some downstream tools such as Picard, new defaults were implemented as of version 0.8.3. default old_flag =================== =================== Read 1 Read 2 Read 1 Read 2 OT: 99 147 67 131 OB: 83 163 115 179 CTOT: 99 147 67 131 CTOB: 83 163 115 179 Other: -h/--help Displays this help file. -v/--version Displays version information. BOWTIE 2 SPECIFIC OPTIONS --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end alignments, i.e. searches for alignments involving all read characters (also called untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter and/or quality trimmed where appropriate. Default: off. Bowtie 2 alignment options: -N <int> Sets the number of mismatches to allowed in a seed alignment during multiseed alignment. Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for Bowtie 1 see -n). -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values make alignment slower but more senstive. Default: the --sensitive preset of Bowtie 2 is used by default, which sets -L to 20. This option is only available for Bowtie 2 (for Bowtie 1 see -l). --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched position to be the highest possible, regardless of the actual value. I.e. input is treated as though all quality values are high. This is also the default behavior when the input doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. Bowtie 2 paired-end options: --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if it cannot find a concordant or discordant alignment for a pair. This option is invariable and and on by default. --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. A discordant alignment is an alignment where both mates align uniquely, but that does not satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior and it is on by default. Bowtie 2 effort options: -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using the alignments found so far. A seed extension "fails" if it does not yield a new best or a new second-best alignment. Default: 15. -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of mismatches allowed) at different offsets and searches for more alignments. A read is considered to have repetitive seeds if the total number of seed hits divided by the number of seeds that aligned at least once is greater than 300. Default: 2. Bowtie 2 parallelization options: -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores and synchronize when parsing reads and outputting alignments. Searching for alignments is highly parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will automatically use the option '--reorder', which guarantees that output SAM records are printed in an order corresponding to the order of the reads in the original input file, even when -p is set greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally correspond to input order in that case. Bowtie 2 Scoring options: --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is L,0,-0.2. --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty of <int1> + N * <int2>. Default: 5, 3. --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets a penalty of <int1> + N * <int2>. Default: 5, 3. Bowtie 2 Reporting options: -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the effort expended to find valid alignments. For reference, this used to be the old (now deprecated) description of -M: Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever happens first. Only the best alignment is reported. Information from the other alignments is used to estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not guarantee that the alignment reported is the best possible in terms of alignment score. -M is always used and its default value is set to 10. 'VANILLA' Bismark OUTPUT: Single-end output format (tab-separated): (1) <seq-ID> (2) <read alignment strand> (3) <chromosome> (4) <start position> (5) <end position> (6) <observed bisulfite sequence> (7) <equivalent genomic sequence> (8) <methylation call> (9) <read conversion (10) <genome conversion> (11) <read quality score (Phred33)> Paired-end output format (tab-separated): (1) <seq-ID> (2) <read 1 alignment strand> (3) <chromosome> (4) <start position> (5) <end position> (6) <observed bisulfite sequence 1> (7) <equivalent genomic sequence 1> (8) <methylation call 1> (9) <observed bisulfite sequence 2> (10) <equivalent genomic sequence 2> (11) <methylation call 2> (12) <read 1 conversion (13) <genome conversion> (14) <read 1 quality score (Phred33)> (15) <read 2 quality score (Phred33)> Bismark SAM OUTPUT (default): (1) QNAME (seq-ID) (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) (3) RNAME (chromosome) (4) POS (start position) (5) MAPQ (always 255) (6) CIGAR (7) RNEXT (8) PNEXT (9) TLEN (10) SEQ (11) QUAL (Phred33 scale) (12) NM-tag (edit distance to the reference) (13) XX-tag (base-by-base mismatches to the reference. This does not include indels) (14) XM-tag (methylation call string) (15) XR-tag (read conversion state for the alignment) (16) XG-tag (genome conversion state for the alignment) (17) XA/XB-tag (non-bisulfite mismatches) (optional!) Each read of paired-end alignments is written out in a separate line in the above format. Last edited on 07 October 2013. HOW_TO }