Mercurial > repos > bgruening > bismark
diff bismark @ 3:91f07ff056ca draft
Uploaded
author | bgruening |
---|---|
date | Mon, 14 Apr 2014 16:43:14 -0400 |
parents | 62c6da72dd4a |
children |
line wrap: on
line diff
--- a/bismark Wed Aug 21 05:19:54 2013 -0400 +++ b/bismark Mon Apr 14 16:43:14 2014 -0400 @@ -24,7 +24,7 @@ my $parent_dir = getcwd; -my $bismark_version = 'v0.7.12'; +my $bismark_version = 'v0.10.0'; my $command_line = join (" ",@ARGV); ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail @@ -35,7 +35,7 @@ } my @filenames; # will be populated by processing the command line -my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat) = process_command_line(); +my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line(); my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment my %chromosomes; # stores the chromosome sequences of the mouse genome @@ -293,9 +293,13 @@ ### printing all alignments to a results file my $outfile = $filename; + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default for Bowtie 2 - $outfile =~ s/$/_bt2_bismark.sam/; + $outfile =~ s/$/_bismark_bt2.sam/; } elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) $outfile =~ s/$/_bismark.txt/; @@ -327,8 +331,12 @@ ### printing alignment and methylation call summary to a report file my $reportfile = $filename; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + if ($bowtie2){ - $reportfile =~ s/$/_bt2_bismark_SE_report.txt/; + $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; } else{ $reportfile =~ s/$/_bismark_SE_report.txt/; @@ -339,12 +347,19 @@ if ($unmapped){ my $unmapped_file = $filename; + if ($prefix){ + $unmapped_file = "$prefix.$unmapped_file"; + } + $unmapped_file =~ s/$/_unmapped_reads.txt/; open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; } if ($ambiguous){ my $ambiguous_file = $filename; + if ($prefix){ + $ambiguous_file = "$prefix.$ambiguous_file"; + } $ambiguous_file =~ s/$/_ambiguous_reads.txt/; open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; @@ -399,7 +414,12 @@ } ### printing all alignments to a results file - my $outfile = $filename_1; + my $outfile = $filename_1; + + if ($prefix){ + $outfile = "$prefix.$outfile"; + } + if ($bowtie2){ # SAM format is the default Bowtie 2 output $outfile =~ s/$/_bismark_bt2_pe.sam/; } @@ -433,6 +453,10 @@ ### printing alignment and methylation call summary to a report file my $reportfile = $filename_1; + if ($prefix){ + $reportfile = "$prefix.$reportfile"; + } + if ($bowtie2){ $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; } @@ -449,6 +473,10 @@ if ($unmapped){ my $unmapped_1 = $filename_1; my $unmapped_2 = $filename_2; + if ($prefix){ + $unmapped_1 = "$prefix.$unmapped_1"; + $unmapped_2 = "$prefix.$unmapped_2"; + } $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; @@ -459,6 +487,11 @@ if ($ambiguous){ my $amb_1 = $filename_1; my $amb_2 = $filename_2; + if ($prefix){ + $amb_1 = "$prefix.$amb_1"; + $amb_2 = "$prefix.$amb_2"; + } + $amb_1 =~ s/$/_ambiguous_reads_1.txt/; $amb_2 =~ s/$/_ambiguous_reads_2.txt/; open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; @@ -578,19 +611,38 @@ warn "Total number of C's analysed:\t$total_number_of_C\n\n"; warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; - print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n"; + + print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + print REPORT "\n"; my $percent_meCHG; if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ @@ -607,6 +659,12 @@ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); } + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + ### printing methylated CpG percentage if applicable if ($percent_meCpG){ warn "C methylated in CpG context:\t${percent_meCpG}%\n"; @@ -629,20 +687,117 @@ ### printing methylated C percentage (CHH context) if applicable if ($percent_meCHH){ - warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; - print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; } else{ - warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - } + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; if ($seqID_contains_tabs){ warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; } + + + ########################################################################################################################################### + ### create pie-chart with mapping stats + ########################################################################################################################################### + + + my $filename; + if ($pbat){ + $filename = $G_to_A_infile; + } + else{ + $filename = $C_to_T_infile; + } + + my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $pie_chart =~ s/gz$//; + $pie_chart =~ s/_C_to_T\.fastq$//; + $pie_chart =~ s/_G_to_A\.fastq$//; + + # if ($prefix){ + # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation + # } + $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; + + + #Check whether the module GD::Graph is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::pie; + GD::Graph::pie->import(); + }; + + unless($@) { + $gd_graph_installed = 1; + } + else{ + warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; + sleep(2); + } + + if ($gd_graph_installed){ + warn "Generating pie chart\n\n"; + sleep(1); + my $graph = GD::Graph::pie->new(600,600); + + my $percent_unaligned; + my $percent_multiple; + my $percent_unextractable; + + if ($counting{sequences_count}){ + $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); + $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); + $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); + } + else{ + $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; + } + + my @aln_stats = ( + ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], + [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], + ); + + $graph->set( + start_angle => 180, + '3d' => 0, + label => 'Alignment stats (single-end)', + suppress_angle => 2, # Only label slices of sufficient size + transparent => 0, + dclrs => [ qw(red lorange dgreen cyan) ], + ) or die $graph->error; + + my $gd = $graph->plot(\@aln_stats) or die $graph->error; + + open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; + binmode PIE; + print PIE $gd->png; + } + + warn "====================\nBismark run complete\n====================\n\n"; + } + sub print_final_analysis_report_paired_ends{ my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles @@ -736,18 +891,36 @@ warn "Total number of C's analysed:\t$total_number_of_C\n\n"; warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; + } + warn "\n"; + + warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; + } + warn "\n"; print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; - print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; - print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; - print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; - print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; + print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; + if ($bowtie2){ + print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; + } + print REPORT "\n"; + + print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; + print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; + print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; + if ($bowtie2){ + print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; + } + print REPORT "\n"; my $percent_meCHG; if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ @@ -764,6 +937,12 @@ $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); } + my $percent_meC_unknown; + if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ + $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); + } + + ### printing methylated CpG percentage if applicable if ($percent_meCpG){ warn "C methylated in CpG context:\t${percent_meCpG}%\n"; @@ -786,13 +965,112 @@ ### printing methylated C percentage in CHH context if applicable if ($percent_meCHH){ - warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; - print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; + warn "C methylated in CHH context:\t${percent_meCHH}%\n"; + print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; + } + + ### printing methylated C percentage (Unknown C context) if applicable + if ($bowtie2){ + if ($percent_meC_unknown){ + warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; + } + else{ + warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; + } + } + print REPORT "\n\n"; + warn "\n\n"; + + + ############################################################################################################################################ + ### create pie-chart with mapping stats + ########################################################################################################################################### + + my $filename; + if ($pbat){ + $filename = $G_to_A_infile_1; } else{ - warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; - } + $filename = $C_to_T_infile_1; + } + + my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified + $pie_chart =~ s/gz$//; + $pie_chart =~ s/_C_to_T.fastq$//; + $pie_chart =~ s/_G_to_A.fastq$//; + ### special format for gzipped PE Bowtie1 files + $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//; + $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//; + + if ($prefix){ + # prefix is now being prepended to the temp files already + # $pie_chart = "$prefix.$pie_chart"; + } + $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png"; + + #Check whether the module GD::Graph is installed + my $gd_graph_installed = 0; + eval{ + require GD::Graph::pie; + GD::Graph::pie->import(); + }; + + unless($@) { + $gd_graph_installed = 1; + } + else{ + warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; + sleep(2); + } + + if ($gd_graph_installed){ + warn "Generating pie chart\n\n"; + sleep(1); + my $graph = GD::Graph::pie->new(600,600); + + my $percent_unaligned; + my $percent_multiple; + my $percent_unextractable; + + if ($counting{sequences_count}){ + $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); + $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); + $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); + } + else{ + $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; + } + + my @aln_stats = ( + ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], + [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], + ); + + # push @{$mbias_read1[0]},$pos; + + $graph->set( + start_angle => 180, + '3d' => 0, + label => 'Alignment stats (paired-end)', + suppress_angle => 2, # Only label slices of sufficient size + transparent => 0, + dclrs => [ qw(red lorange dgreen cyan) ], + ) or die $graph->error; + + my $gd = $graph->plot(\@aln_stats) or die $graph->error; + + open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; + binmode PIE; + print PIE $gd->png; + } + + warn "====================\nBismark run complete\n====================\n\n"; } @@ -831,7 +1109,7 @@ } $counting{sequences_count}++; - if ($counting{sequences_count}%100000==0) { + if ($counting{sequences_count}%1000000==0) { warn "Processed $counting{sequences_count} sequences so far\n"; } chomp $sequence; @@ -993,8 +1271,8 @@ } $counting{sequences_count}++; - if ($counting{sequences_count}%100000==0) { - warn "Processed $counting{sequences_count} sequences so far\n"; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; } my $orig_identifier_1 = $identifier_1; my $orig_identifier_2 = $identifier_2; @@ -1090,8 +1368,8 @@ } $counting{sequences_count}++; - if ($counting{sequences_count}%100000==0) { - warn "Processed $counting{sequences_count} sequences so far\n"; + if ($counting{sequences_count}%1000000==0) { + warn "Processed $counting{sequences_count} sequence pairs so far\n"; } my $orig_identifier_1 = $identifier_1; @@ -2778,12 +3056,12 @@ ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ - warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; + warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; $counting{genomic_sequence_could_not_be_extracted_count}++; return 0; } @@ -3077,7 +3355,8 @@ my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; -# print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; + # sleep(10); ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence @@ -3111,7 +3390,7 @@ my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output my $indels_2 = 0; - + ### Extracting read 1 genomic sequence ### # extracting 2 additional bp at the 5' end (read 1) @@ -3160,6 +3439,7 @@ $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; return; } + $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); } @@ -3209,6 +3489,10 @@ if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise + # need to set read 1 as well now to prevent warning + # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; + # sleep(5); + $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; return; } @@ -3809,6 +4093,8 @@ ### h for not methylated C in CHH context (was converted) ### ### Z for methylated C in CpG context (was protected) ### ### z for not methylated C in CpG context (was converted) ### + ### U for methylated C in unknown context (was protected) ### + ### u for not methylated C in unknwon context (was converted) ### ################################################################# my @match =(); @@ -3816,9 +4102,11 @@ my $methyl_CHH_count = 0; my $methyl_CHG_count = 0; my $methyl_CpG_count = 0; + my $methyl_C_unknown_count = 0; my $unmethylated_CHH_count = 0; my $unmethylated_CHG_count = 0; my $unmethylated_CpG_count = 0; + my $unmethylated_C_unknown_count = 0; if ($read_conversion eq 'CT'){ for my $index (0..$#seq) { @@ -3832,7 +4120,10 @@ ++$methyl_CpG_count; push @match,'Z'; # protected C, methylated, in CpG context } - + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C, methylated, in Unknown context + } else { ### C in not in CpG-context, determining the second downstream base context my $second_downstream_base = $genomic[$index+2]; @@ -3841,6 +4132,10 @@ ++$methyl_CHG_count; push @match,'X'; # protected C, methylated, in CHG context } + elsif ($second_downstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } else{ ++$methyl_CHH_count; push @match,'H'; # protected C, methylated, in CHH context @@ -3862,7 +4157,10 @@ ++$unmethylated_CpG_count; push @match,'z'; # converted C, not methylated, in CpG context } - + elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C, not methylated, in Unknown context + } else{ ### C in not in CpG-context, determining the second downstream base context my $second_downstream_base = $genomic[$index+2]; @@ -3871,6 +4169,10 @@ ++$unmethylated_CHG_count; push @match,'x'; # converted C, not methylated, in CHG context } + elsif ($second_downstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C, not methylated, in Unknown context + } else{ ++$unmethylated_CHH_count; push @match,'h'; # converted C, not methylated, in CHH context @@ -3903,7 +4205,10 @@ ++$methyl_CpG_count; push @match,'Z'; # protected C on opposing strand, methylated, in CpG context } - + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$methyl_C_unknown_count; + push @match,'U'; # protected C on opposing strand, methylated, in Unknown context + } else{ ### C in not in CpG-context, determining the second upstream base context my $second_upstream_base = $genomic[$index]; @@ -3912,6 +4217,10 @@ ++$methyl_CHG_count; push @match,'X'; # protected C on opposing strand, methylated, in CHG context } + elsif ($second_upstream_base eq 'N'){ + ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'U'; # protected C, methylated, in Unknown context + } else{ ++$methyl_CHH_count; push @match,'H'; # protected C on opposing strand, methylated, in CHH context @@ -3935,7 +4244,10 @@ ++$unmethylated_CpG_count; push @match,'z'; # converted C on opposing strand, not methylated, in CpG context } - + elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) + ++$unmethylated_C_unknown_count; + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } else{ ### C in not in CpG-context, determining the second upstream base context my $second_upstream_base = $genomic[$index]; @@ -3944,6 +4256,10 @@ ++$unmethylated_CHG_count; push @match,'x'; # converted C on opposing strand, not methylated, in CHG context } + elsif ($second_upstream_base eq 'N'){ + ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) + push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context + } else{ ++$unmethylated_CHH_count; push @match,'h'; # converted C on opposing strand, not methylated, in CHH context @@ -3969,9 +4285,11 @@ $counting{total_meCHH_count} += $methyl_CHH_count; $counting{total_meCHG_count} += $methyl_CHG_count; $counting{total_meCpG_count} += $methyl_CpG_count; + $counting{total_meC_unknown_count} += $methyl_C_unknown_count; $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; + $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; return $methylation_call; @@ -4106,6 +4424,13 @@ $G_to_A_infile =~ s/$/_G_to_A.fa/; } + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; if ($gzip){ @@ -4209,9 +4534,17 @@ } my $C_to_T_infile = my $G_to_A_infile = $filename; + $C_to_T_infile =~ s/$/_C_to_T.fa/; $G_to_A_infile =~ s/$/_G_to_A.fa/; + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + if ($directional){ if ($read_number == 1){ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; @@ -4354,6 +4687,13 @@ my $C_to_T_infile = my $G_to_A_infile = $filename; + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + if ($pbat){ # PBAT-Seq if ($gzip){ $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; @@ -4515,6 +4855,13 @@ $G_to_A_infile =~ s/$/_G_to_A.fastq/; } + if ($prefix){ + # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; + $C_to_T_infile = "$prefix.$C_to_T_infile"; + $G_to_A_infile = "$prefix.$G_to_A_infile"; + # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; + } + if ($directional){ if ($read_number == 1){ warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; @@ -4685,8 +5032,16 @@ my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; + if ($prefix){ + # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; + $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; + $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; + } + $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; + # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; @@ -5557,9 +5912,11 @@ total_meCHH_count => 0, total_meCHG_count => 0, total_meCpG_count => 0, + total_meC_unknown_count => 0, total_unmethylated_CHH_count => 0, total_unmethylated_CHG_count => 0, total_unmethylated_CpG_count => 0, + total_unmethylated_C_unknown_count => 0, sequences_count => 0, no_single_alignment_found => 0, unsuitable_sequence_count => 0, @@ -5743,6 +6100,8 @@ my $bam; my $gzip; my $pbat; + my $prefix; + my $old_flag; my $command_line = GetOptions ('help|man' => \$help, '1=s' => \$mates1, @@ -5784,6 +6143,8 @@ 'bam' => \$bam, 'gzip' => \$gzip, 'pbat' => \$pbat, + 'prefix=s' => \$prefix, + 'old_flag' => \$old_flag, ); @@ -5939,7 +6300,7 @@ my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); foreach my $file(@CT_bowtie_index){ unless (-f $file){ - die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n"; + die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n"; } } ### checking the integrity of $GA_dir @@ -5947,7 +6308,7 @@ my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); foreach my $file(@GA_bowtie_index){ unless (-f $file){ - die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n"; } } } @@ -5958,7 +6319,7 @@ my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); foreach my $file(@CT_bowtie_index){ unless (-f $file){ - die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; } } ### checking the integrity of $GA_dir @@ -5966,7 +6327,7 @@ my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); foreach my $file(@GA_bowtie_index){ unless (-f $file){ - die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; + die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; } } } @@ -6247,6 +6608,11 @@ push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones } + + if ($old_flag){ + warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); + sleep(3); + } } elsif ($mates2){ die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; @@ -6400,7 +6766,18 @@ } } - return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat); + ### PREFIX FOR OUTPUT FILES + if ($prefix){ + # removing trailing dots + + $prefix =~ s/\.+$//; + + warn "Using the following prefix for output files: $prefix\n\n"; + sleep(1); + } + + + return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag); } @@ -6638,8 +7015,17 @@ my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; - my $id_1 = $id.'/1'; - my $id_2 = $id.'/2'; + my $id_1; + my $id_2; + + if ($old_flag){ + $id_1 = $id.'/1'; + $id_2 = $id.'/2'; + } + else{ + $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard + $id_2 = $id; + } # Allows all degenerate nucleotide sequences in reference genome die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; @@ -6759,24 +7145,50 @@ # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) - my $flag_1; # FLAG variable used for SAM format + my $flag_1; # FLAG variable used for SAM format my $flag_2; + ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH) + if ($index == 0){ # OT - $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) - $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) + $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) + } } elsif ($index == 1){ # CTOB - $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) - $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + unless($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) + } } elsif ($index == 2){ # CTOT - $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) - $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + unless ($old_flag){ + $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) + $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) + } + else{ + $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) + $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) + } } elsif ($index == 3){ # OB - $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) - $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + unless ($old_flag){ + $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) + $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) + } + else{ + $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) + $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) + } } ##### @@ -6846,11 +7258,12 @@ # or # # -------------------------> read 1 - # <----------- read 2 read 2 contained within read 1 - - # start and end of read 2 are fully contained within read 1 - $tlen_1 = 0; # Set as 0 when the information is unavailable - $tlen_2 = 0; # Set as 0 when the information is unavailable + # <------------------------ read 2 read 2 contained within read 1 + + # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable + $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, + $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 } } @@ -6882,12 +7295,13 @@ # or # # -------------------------> read 2 - # <----------- read 1 read 1 contained within read 2 + # <------------------------ read 1 read 1 contained within read 2 - # start and end of read 1 are fully contained within read 2 - $tlen_1 = 0; # Set as 0 when the information is unavailable - $tlen_2 = 0; # Set as 0 when the information is unavailable - } + # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable + $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, + $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this + ### as a request by frozenlyse on SeqAnswers on 24 July 2013 + } } } @@ -7353,6 +7767,27 @@ --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified explicitly if Samtools is in the PATH already. +--prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For + example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. + +--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, + this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since + both the appended read IDs and custom FLAG values may cause problems with some downstream tools + such as Picard, new defaults were implemented as of version 0.8.3. + + + default old_flag + =================== =================== + Read 1 Read 2 Read 1 Read 2 + + OT: 99 147 67 131 + + OB: 83 163 115 179 + + CTOT: 99 147 67 131 + + CTOB: 83 163 115 179 + Other: @@ -7518,7 +7953,7 @@ Each read of paired-end alignments is written out in a separate line in the above format. -Last edited on 10 May 2013. +Last edited on 07 October 2013. HOW_TO }