comparison bismark @ 3:91f07ff056ca draft

Uploaded
author bgruening
date Mon, 14 Apr 2014 16:43:14 -0400
parents 62c6da72dd4a
children
comparison
equal deleted inserted replaced
2:82814a8a2395 3:91f07ff056ca
22 ## You should have received a copy of the GNU General Public License 22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. 23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24 24
25 25
26 my $parent_dir = getcwd; 26 my $parent_dir = getcwd;
27 my $bismark_version = 'v0.7.12'; 27 my $bismark_version = 'v0.10.0';
28 my $command_line = join (" ",@ARGV); 28 my $command_line = join (" ",@ARGV);
29 29
30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail 30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
31 foreach my $arg (@ARGV){ 31 foreach my $arg (@ARGV){
32 if ($arg eq '--solexa1.3-quals'){ 32 if ($arg eq '--solexa1.3-quals'){
33 $arg = '--phred64-quals'; 33 $arg = '--phred64-quals';
34 } 34 }
35 } 35 }
36 my @filenames; # will be populated by processing the command line 36 my @filenames; # will be populated by processing the command line
37 37
38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat) = process_command_line(); 38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line();
39 39
40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment 40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
41 my %chromosomes; # stores the chromosome sequences of the mouse genome 41 my %chromosomes; # stores the chromosome sequences of the mouse genome
42 my %counting; # counting various events 42 my %counting; # counting various events
43 43
291 $filename = $sequence_file; 291 $filename = $sequence_file;
292 } 292 }
293 293
294 ### printing all alignments to a results file 294 ### printing all alignments to a results file
295 my $outfile = $filename; 295 my $outfile = $filename;
296 if ($prefix){
297 $outfile = "$prefix.$outfile";
298 }
299
296 300
297 if ($bowtie2){ # SAM format is the default for Bowtie 2 301 if ($bowtie2){ # SAM format is the default for Bowtie 2
298 $outfile =~ s/$/_bt2_bismark.sam/; 302 $outfile =~ s/$/_bismark_bt2.sam/;
299 } 303 }
300 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) 304 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
301 $outfile =~ s/$/_bismark.txt/; 305 $outfile =~ s/$/_bismark.txt/;
302 } 306 }
303 else{ # SAM is the default output 307 else{ # SAM is the default output
325 print OUT "Bismark version: $bismark_version\n"; 329 print OUT "Bismark version: $bismark_version\n";
326 } 330 }
327 331
328 ### printing alignment and methylation call summary to a report file 332 ### printing alignment and methylation call summary to a report file
329 my $reportfile = $filename; 333 my $reportfile = $filename;
334 if ($prefix){
335 $reportfile = "$prefix.$reportfile";
336 }
337
330 if ($bowtie2){ 338 if ($bowtie2){
331 $reportfile =~ s/$/_bt2_bismark_SE_report.txt/; 339 $reportfile =~ s/$/_bismark_bt2_SE_report.txt/;
332 } 340 }
333 else{ 341 else{
334 $reportfile =~ s/$/_bismark_SE_report.txt/; 342 $reportfile =~ s/$/_bismark_SE_report.txt/;
335 } 343 }
336 344
337 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; 345 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
338 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; 346 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
339 347
340 if ($unmapped){ 348 if ($unmapped){
341 my $unmapped_file = $filename; 349 my $unmapped_file = $filename;
350 if ($prefix){
351 $unmapped_file = "$prefix.$unmapped_file";
352 }
353
342 $unmapped_file =~ s/$/_unmapped_reads.txt/; 354 $unmapped_file =~ s/$/_unmapped_reads.txt/;
343 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; 355 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
344 print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; 356 print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
345 } 357 }
346 if ($ambiguous){ 358 if ($ambiguous){
347 my $ambiguous_file = $filename; 359 my $ambiguous_file = $filename;
360 if ($prefix){
361 $ambiguous_file = "$prefix.$ambiguous_file";
362 }
348 $ambiguous_file =~ s/$/_ambiguous_reads.txt/; 363 $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
349 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; 364 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
350 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; 365 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
351 } 366 }
352 367
397 else{ 412 else{
398 $filename_2 = $sequence_file_2; 413 $filename_2 = $sequence_file_2;
399 } 414 }
400 415
401 ### printing all alignments to a results file 416 ### printing all alignments to a results file
402 my $outfile = $filename_1; 417 my $outfile = $filename_1;
418
419 if ($prefix){
420 $outfile = "$prefix.$outfile";
421 }
422
403 if ($bowtie2){ # SAM format is the default Bowtie 2 output 423 if ($bowtie2){ # SAM format is the default Bowtie 2 output
404 $outfile =~ s/$/_bismark_bt2_pe.sam/; 424 $outfile =~ s/$/_bismark_bt2_pe.sam/;
405 } 425 }
406 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) 426 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
407 $outfile =~ s/$/_bismark_pe.txt/; 427 $outfile =~ s/$/_bismark_pe.txt/;
431 print OUT "Bismark version: $bismark_version\n"; 451 print OUT "Bismark version: $bismark_version\n";
432 } 452 }
433 453
434 ### printing alignment and methylation call summary to a report file 454 ### printing alignment and methylation call summary to a report file
435 my $reportfile = $filename_1; 455 my $reportfile = $filename_1;
456 if ($prefix){
457 $reportfile = "$prefix.$reportfile";
458 }
459
436 if ($bowtie2){ 460 if ($bowtie2){
437 $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; 461 $reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
438 } 462 }
439 else{ 463 else{
440 $reportfile =~ s/$/_bismark_PE_report.txt/; 464 $reportfile =~ s/$/_bismark_PE_report.txt/;
447 471
448 ### Unmapped read output 472 ### Unmapped read output
449 if ($unmapped){ 473 if ($unmapped){
450 my $unmapped_1 = $filename_1; 474 my $unmapped_1 = $filename_1;
451 my $unmapped_2 = $filename_2; 475 my $unmapped_2 = $filename_2;
476 if ($prefix){
477 $unmapped_1 = "$prefix.$unmapped_1";
478 $unmapped_2 = "$prefix.$unmapped_2";
479 }
452 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; 480 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
453 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; 481 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
454 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; 482 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
455 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; 483 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
456 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; 484 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
457 } 485 }
458 486
459 if ($ambiguous){ 487 if ($ambiguous){
460 my $amb_1 = $filename_1; 488 my $amb_1 = $filename_1;
461 my $amb_2 = $filename_2; 489 my $amb_2 = $filename_2;
490 if ($prefix){
491 $amb_1 = "$prefix.$amb_1";
492 $amb_2 = "$prefix.$amb_2";
493 }
494
462 $amb_1 =~ s/$/_ambiguous_reads_1.txt/; 495 $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
463 $amb_2 =~ s/$/_ambiguous_reads_2.txt/; 496 $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
464 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; 497 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
465 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; 498 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
466 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; 499 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
576 warn "Final Cytosine Methylation Report\n",'='x33,"\n"; 609 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
577 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; 610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
578 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; 611 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
579 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; 612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
580 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; 613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
581 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; 614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
582 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; 615 if ($bowtie2){
583 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; 616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
584 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; 617 }
618 warn "\n";
619
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
623 if ($bowtie2){
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
625 }
626 warn "\n";
585 627
586 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; 628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
587 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; 629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
588 print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n"; 630
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
589 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; 632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
590 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; 633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
591 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; 634 if ($bowtie2){
592 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; 635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
593 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; 636 }
637 print REPORT "\n";
638
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
642 if ($bowtie2){
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
644 }
645 print REPORT "\n";
594 646
595 my $percent_meCHG; 647 my $percent_meCHG;
596 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ 648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
597 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); 649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
598 } 650 }
604 656
605 my $percent_meCpG; 657 my $percent_meCpG;
606 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ 658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
607 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); 659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
608 } 660 }
661
662 my $percent_meC_unknown;
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
665 }
666
609 667
610 ### printing methylated CpG percentage if applicable 668 ### printing methylated CpG percentage if applicable
611 if ($percent_meCpG){ 669 if ($percent_meCpG){
612 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; 670 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
613 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; 671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
627 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; 685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
628 } 686 }
629 687
630 ### printing methylated C percentage (CHH context) if applicable 688 ### printing methylated C percentage (CHH context) if applicable
631 if ($percent_meCHH){ 689 if ($percent_meCHH){
632 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; 690 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
633 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; 691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
634 } 692 }
635 else{ 693 else{
636 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; 694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
637 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; 695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
638 } 696 }
697
698 ### printing methylated C percentage (Unknown C context) if applicable
699 if ($bowtie2){
700 if ($percent_meC_unknown){
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
703 }
704 else{
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
707 }
708 }
709 print REPORT "\n\n";
710 warn "\n\n";
639 711
640 if ($seqID_contains_tabs){ 712 if ($seqID_contains_tabs){
641 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; 713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
642 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; 714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
643 } 715 }
716
717
718 ###########################################################################################################################################
719 ### create pie-chart with mapping stats
720 ###########################################################################################################################################
721
722
723 my $filename;
724 if ($pbat){
725 $filename = $G_to_A_infile;
726 }
727 else{
728 $filename = $C_to_T_infile;
729 }
730
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
732 $pie_chart =~ s/gz$//;
733 $pie_chart =~ s/_C_to_T\.fastq$//;
734 $pie_chart =~ s/_G_to_A\.fastq$//;
735
736 # if ($prefix){
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
738 # }
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
740
741
742 #Check whether the module GD::Graph is installed
743 my $gd_graph_installed = 0;
744 eval{
745 require GD::Graph::pie;
746 GD::Graph::pie->import();
747 };
748
749 unless($@) {
750 $gd_graph_installed = 1;
751 }
752 else{
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
754 sleep(2);
755 }
756
757 if ($gd_graph_installed){
758 warn "Generating pie chart\n\n";
759 sleep(1);
760 my $graph = GD::Graph::pie->new(600,600);
761
762 my $percent_unaligned;
763 my $percent_multiple;
764 my $percent_unextractable;
765
766 if ($counting{sequences_count}){
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
770 }
771 else{
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
773 }
774
775 my @aln_stats = (
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
778 );
779
780 $graph->set(
781 start_angle => 180,
782 '3d' => 0,
783 label => 'Alignment stats (single-end)',
784 suppress_angle => 2, # Only label slices of sufficient size
785 transparent => 0,
786 dclrs => [ qw(red lorange dgreen cyan) ],
787 ) or die $graph->error;
788
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
790
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
792 binmode PIE;
793 print PIE $gd->png;
794 }
795
796 warn "====================\nBismark run complete\n====================\n\n";
797
644 } 798 }
799
645 800
646 sub print_final_analysis_report_paired_ends{ 801 sub print_final_analysis_report_paired_ends{
647 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; 802 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
648 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles 803 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
649 if ($directional){ 804 if ($directional){
734 889
735 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; 890 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
736 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; 891 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
737 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; 892 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
738 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; 893 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
739 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; 894 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
740 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; 895 if ($bowtie2){
741 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; 896 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
742 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; 897 }
898 warn "\n";
899
900 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
901 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
902 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
903 if ($bowtie2){
904 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
905 }
906 warn "\n";
743 907
744 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; 908 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
745 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; 909 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
746 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; 910 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
747 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; 911 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
748 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; 912 if ($bowtie2){
749 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; 913 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
750 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; 914 }
915 print REPORT "\n";
916
917 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
918 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
919 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
920 if ($bowtie2){
921 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
922 }
923 print REPORT "\n";
751 924
752 my $percent_meCHG; 925 my $percent_meCHG;
753 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ 926 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
754 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); 927 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
755 } 928 }
761 934
762 my $percent_meCpG; 935 my $percent_meCpG;
763 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ 936 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
764 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); 937 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
765 } 938 }
939
940 my $percent_meC_unknown;
941 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
942 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
943 }
944
766 945
767 ### printing methylated CpG percentage if applicable 946 ### printing methylated CpG percentage if applicable
768 if ($percent_meCpG){ 947 if ($percent_meCpG){
769 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; 948 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
770 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; 949 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
784 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; 963 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
785 } 964 }
786 965
787 ### printing methylated C percentage in CHH context if applicable 966 ### printing methylated C percentage in CHH context if applicable
788 if ($percent_meCHH){ 967 if ($percent_meCHH){
789 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; 968 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
790 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; 969 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
791 } 970 }
792 else{ 971 else{
793 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; 972 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
794 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; 973 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
795 } 974 }
975
976 ### printing methylated C percentage (Unknown C context) if applicable
977 if ($bowtie2){
978 if ($percent_meC_unknown){
979 warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
980 print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
981 }
982 else{
983 warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
984 print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
985 }
986 }
987 print REPORT "\n\n";
988 warn "\n\n";
989
990
991 ############################################################################################################################################
992 ### create pie-chart with mapping stats
993 ###########################################################################################################################################
994
995 my $filename;
996 if ($pbat){
997 $filename = $G_to_A_infile_1;
998 }
999 else{
1000 $filename = $C_to_T_infile_1;
1001 }
1002
1003 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
1004 $pie_chart =~ s/gz$//;
1005 $pie_chart =~ s/_C_to_T.fastq$//;
1006 $pie_chart =~ s/_G_to_A.fastq$//;
1007 ### special format for gzipped PE Bowtie1 files
1008 $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
1009 $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;
1010
1011 if ($prefix){
1012 # prefix is now being prepended to the temp files already
1013 # $pie_chart = "$prefix.$pie_chart";
1014 }
1015 $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";
1016
1017 #Check whether the module GD::Graph is installed
1018 my $gd_graph_installed = 0;
1019 eval{
1020 require GD::Graph::pie;
1021 GD::Graph::pie->import();
1022 };
1023
1024 unless($@) {
1025 $gd_graph_installed = 1;
1026 }
1027 else{
1028 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
1029 sleep(2);
1030 }
1031
1032 if ($gd_graph_installed){
1033 warn "Generating pie chart\n\n";
1034 sleep(1);
1035 my $graph = GD::Graph::pie->new(600,600);
1036
1037 my $percent_unaligned;
1038 my $percent_multiple;
1039 my $percent_unextractable;
1040
1041 if ($counting{sequences_count}){
1042 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
1043 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
1044 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
1045 }
1046 else{
1047 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
1048 }
1049
1050 my @aln_stats = (
1051 ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
1052 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
1053 );
1054
1055 # push @{$mbias_read1[0]},$pos;
1056
1057 $graph->set(
1058 start_angle => 180,
1059 '3d' => 0,
1060 label => 'Alignment stats (paired-end)',
1061 suppress_angle => 2, # Only label slices of sufficient size
1062 transparent => 0,
1063 dclrs => [ qw(red lorange dgreen cyan) ],
1064 ) or die $graph->error;
1065
1066 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
1067
1068 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
1069 binmode PIE;
1070 print PIE $gd->png;
1071 }
1072
1073 warn "====================\nBismark run complete\n====================\n\n";
796 1074
797 } 1075 }
798 1076
799 sub process_single_end_fastA_file_for_methylation_call{ 1077 sub process_single_end_fastA_file_for_methylation_call{
800 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; 1078 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
829 if ($upto){ 1107 if ($upto){
830 last if ($count > $upto); 1108 last if ($count > $upto);
831 } 1109 }
832 1110
833 $counting{sequences_count}++; 1111 $counting{sequences_count}++;
834 if ($counting{sequences_count}%100000==0) { 1112 if ($counting{sequences_count}%1000000==0) {
835 warn "Processed $counting{sequences_count} sequences so far\n"; 1113 warn "Processed $counting{sequences_count} sequences so far\n";
836 } 1114 }
837 chomp $sequence; 1115 chomp $sequence;
838 chomp $identifier; 1116 chomp $identifier;
839 1117
991 if ($upto){ 1269 if ($upto){
992 last if ($count > $upto); 1270 last if ($count > $upto);
993 } 1271 }
994 1272
995 $counting{sequences_count}++; 1273 $counting{sequences_count}++;
996 if ($counting{sequences_count}%100000==0) { 1274 if ($counting{sequences_count}%1000000==0) {
997 warn "Processed $counting{sequences_count} sequences so far\n"; 1275 warn "Processed $counting{sequences_count} sequence pairs so far\n";
998 } 1276 }
999 my $orig_identifier_1 = $identifier_1; 1277 my $orig_identifier_1 = $identifier_1;
1000 my $orig_identifier_2 = $identifier_2; 1278 my $orig_identifier_2 = $identifier_2;
1001 1279
1002 chomp $sequence_1; 1280 chomp $sequence_1;
1088 if ($upto){ 1366 if ($upto){
1089 last if ($count > $upto); 1367 last if ($count > $upto);
1090 } 1368 }
1091 1369
1092 $counting{sequences_count}++; 1370 $counting{sequences_count}++;
1093 if ($counting{sequences_count}%100000==0) { 1371 if ($counting{sequences_count}%1000000==0) {
1094 warn "Processed $counting{sequences_count} sequences so far\n"; 1372 warn "Processed $counting{sequences_count} sequence pairs so far\n";
1095 } 1373 }
1096 1374
1097 my $orig_identifier_1 = $identifier_1; 1375 my $orig_identifier_1 = $identifier_1;
1098 my $orig_identifier_2 = $identifier_2; 1376 my $orig_identifier_2 = $identifier_2;
1099 1377
2776 $counting{unique_best_alignment_count}++; 3054 $counting{unique_best_alignment_count}++;
2777 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); 3055 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
2778 3056
2779 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call 3057 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
2780 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ 3058 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2781 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; 3059 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
2782 $counting{genomic_sequence_could_not_be_extracted_count}++; 3060 $counting{genomic_sequence_could_not_be_extracted_count}++;
2783 return 0; 3061 return 0;
2784 } 3062 }
2785 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ 3063 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2786 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; 3064 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
2787 $counting{genomic_sequence_could_not_be_extracted_count}++; 3065 $counting{genomic_sequence_could_not_be_extracted_count}++;
2788 return 0; 3066 return 0;
2789 } 3067 }
2790 3068
2791 ### now we are set to perform the actual methylation call 3069 ### now we are set to perform the actual methylation call
3075 3353
3076 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; 3354 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
3077 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; 3355 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
3078 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; 3356 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
3079 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; 3357 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
3080 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; 3358 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
3359 # sleep(10);
3081 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and 3360 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
3082 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence 3361 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
3083 3362
3084 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, 3363 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
3085 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions 3364 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
3109 shift @ops_2; # remove the empty first element 3388 shift @ops_2; # remove the empty first element
3110 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); 3389 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
3111 3390
3112 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output 3391 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
3113 my $indels_2 = 0; 3392 my $indels_2 = 0;
3114 3393
3115 ### Extracting read 1 genomic sequence ### 3394 ### Extracting read 1 genomic sequence ###
3116 3395
3117 # extracting 2 additional bp at the 5' end (read 1) 3396 # extracting 2 additional bp at the 5' end (read 1)
3118 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ 3397 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
3119 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome 3398 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3158 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome 3437 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3159 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise 3438 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
3160 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; 3439 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
3161 return; 3440 return;
3162 } 3441 }
3442
3163 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); 3443 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
3164 } 3444 }
3165 3445
3166 3446
3167 ### Extracting read 2 genomic sequence ### 3447 ### Extracting read 2 genomic sequence ###
3207 3487
3208 ### 3' end of read 2 3488 ### 3' end of read 2
3209 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ 3489 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
3210 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome 3490 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3211 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise 3491 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
3492 # need to set read 1 as well now to prevent warning
3493 # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n";
3494 # sleep(5);
3495 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
3212 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; 3496 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
3213 return; 3497 return;
3214 } 3498 }
3215 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); 3499 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
3216 } 3500 }
3807 ### x for not methylated C in CHG context (was converted) ### 4091 ### x for not methylated C in CHG context (was converted) ###
3808 ### H for methylated C in CHH context (was protected) ### 4092 ### H for methylated C in CHH context (was protected) ###
3809 ### h for not methylated C in CHH context (was converted) ### 4093 ### h for not methylated C in CHH context (was converted) ###
3810 ### Z for methylated C in CpG context (was protected) ### 4094 ### Z for methylated C in CpG context (was protected) ###
3811 ### z for not methylated C in CpG context (was converted) ### 4095 ### z for not methylated C in CpG context (was converted) ###
4096 ### U for methylated C in unknown context (was protected) ###
4097 ### u for not methylated C in unknwon context (was converted) ###
3812 ################################################################# 4098 #################################################################
3813 4099
3814 my @match =(); 4100 my @match =();
3815 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 4101 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
3816 my $methyl_CHH_count = 0; 4102 my $methyl_CHH_count = 0;
3817 my $methyl_CHG_count = 0; 4103 my $methyl_CHG_count = 0;
3818 my $methyl_CpG_count = 0; 4104 my $methyl_CpG_count = 0;
4105 my $methyl_C_unknown_count = 0;
3819 my $unmethylated_CHH_count = 0; 4106 my $unmethylated_CHH_count = 0;
3820 my $unmethylated_CHG_count = 0; 4107 my $unmethylated_CHG_count = 0;
3821 my $unmethylated_CpG_count = 0; 4108 my $unmethylated_CpG_count = 0;
4109 my $unmethylated_C_unknown_count = 0;
3822 4110
3823 if ($read_conversion eq 'CT'){ 4111 if ($read_conversion eq 'CT'){
3824 for my $index (0..$#seq) { 4112 for my $index (0..$#seq) {
3825 if ($seq[$index] eq $genomic[$index]) { 4113 if ($seq[$index] eq $genomic[$index]) {
3826 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation 4114 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation
3830 4118
3831 if ($downstream_base eq 'G'){ 4119 if ($downstream_base eq 'G'){
3832 ++$methyl_CpG_count; 4120 ++$methyl_CpG_count;
3833 push @match,'Z'; # protected C, methylated, in CpG context 4121 push @match,'Z'; # protected C, methylated, in CpG context
3834 } 4122 }
3835 4123 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4124 ++$methyl_C_unknown_count;
4125 push @match,'U'; # protected C, methylated, in Unknown context
4126 }
3836 else { 4127 else {
3837 ### C in not in CpG-context, determining the second downstream base context 4128 ### C in not in CpG-context, determining the second downstream base context
3838 my $second_downstream_base = $genomic[$index+2]; 4129 my $second_downstream_base = $genomic[$index+2];
3839 4130
3840 if ($second_downstream_base eq 'G'){ 4131 if ($second_downstream_base eq 'G'){
3841 ++$methyl_CHG_count; 4132 ++$methyl_CHG_count;
3842 push @match,'X'; # protected C, methylated, in CHG context 4133 push @match,'X'; # protected C, methylated, in CHG context
4134 }
4135 elsif ($second_downstream_base eq 'N'){
4136 ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4137 push @match,'U'; # protected C, methylated, in Unknown context
3843 } 4138 }
3844 else{ 4139 else{
3845 ++$methyl_CHH_count; 4140 ++$methyl_CHH_count;
3846 push @match,'H'; # protected C, methylated, in CHH context 4141 push @match,'H'; # protected C, methylated, in CHH context
3847 } 4142 }
3860 4155
3861 if ($downstream_base eq 'G'){ 4156 if ($downstream_base eq 'G'){
3862 ++$unmethylated_CpG_count; 4157 ++$unmethylated_CpG_count;
3863 push @match,'z'; # converted C, not methylated, in CpG context 4158 push @match,'z'; # converted C, not methylated, in CpG context
3864 } 4159 }
3865 4160 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4161 ++$unmethylated_C_unknown_count;
4162 push @match,'u'; # converted C, not methylated, in Unknown context
4163 }
3866 else{ 4164 else{
3867 ### C in not in CpG-context, determining the second downstream base context 4165 ### C in not in CpG-context, determining the second downstream base context
3868 my $second_downstream_base = $genomic[$index+2]; 4166 my $second_downstream_base = $genomic[$index+2];
3869 4167
3870 if ($second_downstream_base eq 'G'){ 4168 if ($second_downstream_base eq 'G'){
3871 ++$unmethylated_CHG_count; 4169 ++$unmethylated_CHG_count;
3872 push @match,'x'; # converted C, not methylated, in CHG context 4170 push @match,'x'; # converted C, not methylated, in CHG context
4171 }
4172 elsif ($second_downstream_base eq 'N'){
4173 ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4174 push @match,'u'; # converted C, not methylated, in Unknown context
3873 } 4175 }
3874 else{ 4176 else{
3875 ++$unmethylated_CHH_count; 4177 ++$unmethylated_CHH_count;
3876 push @match,'h'; # converted C, not methylated, in CHH context 4178 push @match,'h'; # converted C, not methylated, in CHH context
3877 } 4179 }
3901 4203
3902 if ($upstream_base eq 'C'){ 4204 if ($upstream_base eq 'C'){
3903 ++$methyl_CpG_count; 4205 ++$methyl_CpG_count;
3904 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context 4206 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
3905 } 4207 }
3906 4208 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4209 ++$methyl_C_unknown_count;
4210 push @match,'U'; # protected C on opposing strand, methylated, in Unknown context
4211 }
3907 else{ 4212 else{
3908 ### C in not in CpG-context, determining the second upstream base context 4213 ### C in not in CpG-context, determining the second upstream base context
3909 my $second_upstream_base = $genomic[$index]; 4214 my $second_upstream_base = $genomic[$index];
3910 4215
3911 if ($second_upstream_base eq 'C'){ 4216 if ($second_upstream_base eq 'C'){
3912 ++$methyl_CHG_count; 4217 ++$methyl_CHG_count;
3913 push @match,'X'; # protected C on opposing strand, methylated, in CHG context 4218 push @match,'X'; # protected C on opposing strand, methylated, in CHG context
4219 }
4220 elsif ($second_upstream_base eq 'N'){
4221 ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4222 push @match,'U'; # protected C, methylated, in Unknown context
3914 } 4223 }
3915 else{ 4224 else{
3916 ++$methyl_CHH_count; 4225 ++$methyl_CHH_count;
3917 push @match,'H'; # protected C on opposing strand, methylated, in CHH context 4226 push @match,'H'; # protected C on opposing strand, methylated, in CHH context
3918 } 4227 }
3933 4242
3934 if ($upstream_base eq 'C'){ 4243 if ($upstream_base eq 'C'){
3935 ++$unmethylated_CpG_count; 4244 ++$unmethylated_CpG_count;
3936 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context 4245 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
3937 } 4246 }
3938 4247 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4248 ++$unmethylated_C_unknown_count;
4249 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
4250 }
3939 else{ 4251 else{
3940 ### C in not in CpG-context, determining the second upstream base context 4252 ### C in not in CpG-context, determining the second upstream base context
3941 my $second_upstream_base = $genomic[$index]; 4253 my $second_upstream_base = $genomic[$index];
3942 4254
3943 if ($second_upstream_base eq 'C'){ 4255 if ($second_upstream_base eq 'C'){
3944 ++$unmethylated_CHG_count; 4256 ++$unmethylated_CHG_count;
3945 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context 4257 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
4258 }
4259 elsif ($second_upstream_base eq 'N'){
4260 ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4261 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
3946 } 4262 }
3947 else{ 4263 else{
3948 ++$unmethylated_CHH_count; 4264 ++$unmethylated_CHH_count;
3949 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context 4265 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
3950 } 4266 }
3967 my $methylation_call = join ("",@match); 4283 my $methylation_call = join ("",@match);
3968 4284
3969 $counting{total_meCHH_count} += $methyl_CHH_count; 4285 $counting{total_meCHH_count} += $methyl_CHH_count;
3970 $counting{total_meCHG_count} += $methyl_CHG_count; 4286 $counting{total_meCHG_count} += $methyl_CHG_count;
3971 $counting{total_meCpG_count} += $methyl_CpG_count; 4287 $counting{total_meCpG_count} += $methyl_CpG_count;
4288 $counting{total_meC_unknown_count} += $methyl_C_unknown_count;
3972 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; 4289 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
3973 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; 4290 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
3974 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; 4291 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
4292 $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count;
3975 4293
3976 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; 4294 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
3977 return $methylation_call; 4295 return $methylation_call;
3978 } 4296 }
3979 4297
4104 else{ 4422 else{
4105 $C_to_T_infile =~ s/$/_C_to_T.fa/; 4423 $C_to_T_infile =~ s/$/_C_to_T.fa/;
4106 $G_to_A_infile =~ s/$/_G_to_A.fa/; 4424 $G_to_A_infile =~ s/$/_G_to_A.fa/;
4107 } 4425 }
4108 4426
4427 if ($prefix){
4428 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
4429 $C_to_T_infile = "$prefix.$C_to_T_infile";
4430 $G_to_A_infile = "$prefix.$G_to_A_infile";
4431 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
4432 }
4433
4109 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; 4434 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
4110 4435
4111 if ($gzip){ 4436 if ($gzip){
4112 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; 4437 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
4113 } 4438 }
4207 warn "Processing reads up to sequence no. $upto from $file\n"; 4532 warn "Processing reads up to sequence no. $upto from $file\n";
4208 sleep (1); 4533 sleep (1);
4209 } 4534 }
4210 4535
4211 my $C_to_T_infile = my $G_to_A_infile = $filename; 4536 my $C_to_T_infile = my $G_to_A_infile = $filename;
4537
4212 $C_to_T_infile =~ s/$/_C_to_T.fa/; 4538 $C_to_T_infile =~ s/$/_C_to_T.fa/;
4213 $G_to_A_infile =~ s/$/_G_to_A.fa/; 4539 $G_to_A_infile =~ s/$/_G_to_A.fa/;
4540
4541 if ($prefix){
4542 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
4543 $C_to_T_infile = "$prefix.$C_to_T_infile";
4544 $G_to_A_infile = "$prefix.$G_to_A_infile";
4545 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
4546 }
4214 4547
4215 if ($directional){ 4548 if ($directional){
4216 if ($read_number == 1){ 4549 if ($read_number == 1){
4217 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; 4550 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
4218 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; 4551 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
4351 warn "Processing reads up to sequence no. $upto from $file\n"; 4684 warn "Processing reads up to sequence no. $upto from $file\n";
4352 sleep (1); 4685 sleep (1);
4353 } 4686 }
4354 4687
4355 my $C_to_T_infile = my $G_to_A_infile = $filename; 4688 my $C_to_T_infile = my $G_to_A_infile = $filename;
4689
4690 if ($prefix){
4691 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
4692 $C_to_T_infile = "$prefix.$C_to_T_infile";
4693 $G_to_A_infile = "$prefix.$G_to_A_infile";
4694 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
4695 }
4356 4696
4357 if ($pbat){ # PBAT-Seq 4697 if ($pbat){ # PBAT-Seq
4358 if ($gzip){ 4698 if ($gzip){
4359 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; 4699 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
4360 } 4700 }
4511 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; 4851 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
4512 } 4852 }
4513 else{ 4853 else{
4514 $C_to_T_infile =~ s/$/_C_to_T.fastq/; 4854 $C_to_T_infile =~ s/$/_C_to_T.fastq/;
4515 $G_to_A_infile =~ s/$/_G_to_A.fastq/; 4855 $G_to_A_infile =~ s/$/_G_to_A.fastq/;
4856 }
4857
4858 if ($prefix){
4859 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n";
4860 $C_to_T_infile = "$prefix.$C_to_T_infile";
4861 $G_to_A_infile = "$prefix.$G_to_A_infile";
4862 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n";
4516 } 4863 }
4517 4864
4518 if ($directional){ 4865 if ($directional){
4519 if ($read_number == 1){ 4866 if ($read_number == 1){
4520 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; 4867 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
4683 sleep (1); 5030 sleep (1);
4684 } 5031 }
4685 5032
4686 my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; 5033 my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;
4687 5034
5035 if ($prefix){
5036 # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n";
5037 $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
5038 $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
5039 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
5040 }
5041
4688 $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; 5042 $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/;
4689 $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; 5043 $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/;
5044 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n";
4690 5045
4691 warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; 5046 warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
4692 open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; 5047 open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";
4693 # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; 5048 # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n";
4694 5049
5555 my $filename = shift; 5910 my $filename = shift;
5556 %counting=( 5911 %counting=(
5557 total_meCHH_count => 0, 5912 total_meCHH_count => 0,
5558 total_meCHG_count => 0, 5913 total_meCHG_count => 0,
5559 total_meCpG_count => 0, 5914 total_meCpG_count => 0,
5915 total_meC_unknown_count => 0,
5560 total_unmethylated_CHH_count => 0, 5916 total_unmethylated_CHH_count => 0,
5561 total_unmethylated_CHG_count => 0, 5917 total_unmethylated_CHG_count => 0,
5562 total_unmethylated_CpG_count => 0, 5918 total_unmethylated_CpG_count => 0,
5919 total_unmethylated_C_unknown_count => 0,
5563 sequences_count => 0, 5920 sequences_count => 0,
5564 no_single_alignment_found => 0, 5921 no_single_alignment_found => 0,
5565 unsuitable_sequence_count => 0, 5922 unsuitable_sequence_count => 0,
5566 genomic_sequence_could_not_be_extracted_count => 0, 5923 genomic_sequence_could_not_be_extracted_count => 0,
5567 unique_best_alignment_count => 0, 5924 unique_best_alignment_count => 0,
5741 my $non_bs_mm; 6098 my $non_bs_mm;
5742 my $samtools_path; 6099 my $samtools_path;
5743 my $bam; 6100 my $bam;
5744 my $gzip; 6101 my $gzip;
5745 my $pbat; 6102 my $pbat;
6103 my $prefix;
6104 my $old_flag;
5746 6105
5747 my $command_line = GetOptions ('help|man' => \$help, 6106 my $command_line = GetOptions ('help|man' => \$help,
5748 '1=s' => \$mates1, 6107 '1=s' => \$mates1,
5749 '2=s' => \$mates2, 6108 '2=s' => \$mates2,
5750 'path_to_bowtie=s' => \$path_to_bowtie, 6109 'path_to_bowtie=s' => \$path_to_bowtie,
5782 'non_bs_mm' => \$non_bs_mm, 6141 'non_bs_mm' => \$non_bs_mm,
5783 'samtools_path=s' => \$samtools_path, 6142 'samtools_path=s' => \$samtools_path,
5784 'bam' => \$bam, 6143 'bam' => \$bam,
5785 'gzip' => \$gzip, 6144 'gzip' => \$gzip,
5786 'pbat' => \$pbat, 6145 'pbat' => \$pbat,
6146 'prefix=s' => \$prefix,
6147 'old_flag' => \$old_flag,
5787 ); 6148 );
5788 6149
5789 6150
5790 ### EXIT ON ERROR if there were errors with any of the supplied options 6151 ### EXIT ON ERROR if there were errors with any of the supplied options
5791 unless ($command_line){ 6152 unless ($command_line){
5937 ### checking the integrity of $CT_dir 6298 ### checking the integrity of $CT_dir
5938 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; 6299 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
5939 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); 6300 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
5940 foreach my $file(@CT_bowtie_index){ 6301 foreach my $file(@CT_bowtie_index){
5941 unless (-f $file){ 6302 unless (-f $file){
5942 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n"; 6303 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
5943 } 6304 }
5944 } 6305 }
5945 ### checking the integrity of $GA_dir 6306 ### checking the integrity of $GA_dir
5946 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; 6307 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
5947 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); 6308 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
5948 foreach my $file(@GA_bowtie_index){ 6309 foreach my $file(@GA_bowtie_index){
5949 unless (-f $file){ 6310 unless (-f $file){
5950 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; 6311 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
5951 } 6312 }
5952 } 6313 }
5953 } 6314 }
5954 6315
5955 else{ ### Bowtie 1 (default) 6316 else{ ### Bowtie 1 (default)
5956 ### checking the integrity of $CT_dir 6317 ### checking the integrity of $CT_dir
5957 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; 6318 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
5958 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); 6319 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
5959 foreach my $file(@CT_bowtie_index){ 6320 foreach my $file(@CT_bowtie_index){
5960 unless (-f $file){ 6321 unless (-f $file){
5961 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; 6322 die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
5962 } 6323 }
5963 } 6324 }
5964 ### checking the integrity of $GA_dir 6325 ### checking the integrity of $GA_dir
5965 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; 6326 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
5966 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); 6327 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
5967 foreach my $file(@GA_bowtie_index){ 6328 foreach my $file(@GA_bowtie_index){
5968 unless (-f $file){ 6329 unless (-f $file){
5969 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; 6330 die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
5970 } 6331 }
5971 } 6332 }
5972 } 6333 }
5973 6334
5974 my $CT_index_basename = "${CT_dir}BS_CT"; 6335 my $CT_index_basename = "${CT_dir}BS_CT";
6245 } 6606 }
6246 if ($bowtie2){ 6607 if ($bowtie2){
6247 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments 6608 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
6248 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones 6609 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
6249 } 6610 }
6611
6612 if ($old_flag){
6613 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
6614 sleep(3);
6615 }
6250 } 6616 }
6251 elsif ($mates2){ 6617 elsif ($mates2){
6252 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; 6618 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
6253 } 6619 }
6254 6620
6398 if ($vanilla){ 6764 if ($vanilla){
6399 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; 6765 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
6400 } 6766 }
6401 } 6767 }
6402 6768
6403 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat); 6769 ### PREFIX FOR OUTPUT FILES
6770 if ($prefix){
6771 # removing trailing dots
6772
6773 $prefix =~ s/\.+$//;
6774
6775 warn "Using the following prefix for output files: $prefix\n\n";
6776 sleep(1);
6777 }
6778
6779
6780 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag);
6404 } 6781 }
6405 6782
6406 6783
6407 6784
6408 sub generate_SAM_header{ 6785 sub generate_SAM_header{
6636 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; 7013 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
6637 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; 7014 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
6638 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; 7015 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
6639 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; 7016 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
6640 7017
6641 my $id_1 = $id.'/1'; 7018 my $id_1;
6642 my $id_2 = $id.'/2'; 7019 my $id_2;
7020
7021 if ($old_flag){
7022 $id_1 = $id.'/1';
7023 $id_2 = $id.'/2';
7024 }
7025 else{
7026 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard
7027 $id_2 = $id;
7028 }
6643 7029
6644 # Allows all degenerate nucleotide sequences in reference genome 7030 # Allows all degenerate nucleotide sequences in reference genome
6645 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; 7031 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
6646 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; 7032 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
6647 7033
6757 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account 7143 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
6758 7144
6759 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) 7145 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
6760 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) 7146 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
6761 7147
6762 my $flag_1; # FLAG variable used for SAM format 7148 my $flag_1; # FLAG variable used for SAM format
6763 my $flag_2; 7149 my $flag_2;
6764 7150
7151 ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH)
7152
6765 if ($index == 0){ # OT 7153 if ($index == 0){ # OT
6766 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) 7154 unless ($old_flag){
6767 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) 7155 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
7156 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
7157 }
7158 else{
7159 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
7160 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
7161 }
6768 } 7162 }
6769 elsif ($index == 1){ # CTOB 7163 elsif ($index == 1){ # CTOB
6770 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) 7164 unless($old_flag){
6771 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) 7165 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
7166 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
7167 }
7168 else{
7169 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
7170 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
7171 }
6772 } 7172 }
6773 elsif ($index == 2){ # CTOT 7173 elsif ($index == 2){ # CTOT
6774 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) 7174 unless ($old_flag){
6775 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) 7175 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
7176 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
7177 }
7178 else{
7179 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
7180 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
7181 }
6776 } 7182 }
6777 elsif ($index == 3){ # OB 7183 elsif ($index == 3){ # OB
6778 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) 7184 unless ($old_flag){
6779 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) 7185 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
7186 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
7187 }
7188 else{
7189 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
7190 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
7191 }
6780 } 7192 }
6781 7193
6782 ##### 7194 #####
6783 7195
6784 my $mapq = 255; # Mapping quality is unavailable 7196 my $mapq = 255; # Mapping quality is unavailable
6844 # <----------- read 2 read 2 contained within read 1 7256 # <----------- read 2 read 2 contained within read 1
6845 # 7257 #
6846 # or 7258 # or
6847 # 7259 #
6848 # -------------------------> read 1 7260 # -------------------------> read 1
6849 # <----------- read 2 read 2 contained within read 1 7261 # <------------------------ read 2 read 2 contained within read 1
6850 7262
6851 # start and end of read 2 are fully contained within read 1 7263 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable
6852 $tlen_1 = 0; # Set as 0 when the information is unavailable 7264 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign,
6853 $tlen_2 = 0; # Set as 0 when the information is unavailable 7265 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this
7266 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
6854 } 7267 }
6855 7268
6856 } 7269 }
6857 7270
6858 elsif ($start_read_2 < $start_read_1){ 7271 elsif ($start_read_2 < $start_read_1){
6880 # <----------- read 1 read 1 contained within read 2 7293 # <----------- read 1 read 1 contained within read 2
6881 # 7294 #
6882 # or 7295 # or
6883 # 7296 #
6884 # -------------------------> read 2 7297 # -------------------------> read 2
6885 # <----------- read 1 read 1 contained within read 2 7298 # <------------------------ read 1 read 1 contained within read 2
6886 7299
6887 # start and end of read 1 are fully contained within read 2 7300 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable
6888 $tlen_1 = 0; # Set as 0 when the information is unavailable 7301 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign,
6889 $tlen_2 = 0; # Set as 0 when the information is unavailable 7302 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this
6890 } 7303 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
7304 }
6891 } 7305 }
6892 } 7306 }
6893 7307
6894 else{ # Bowtie 1 7308 else{ # Bowtie 1
6895 7309
7351 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). 7765 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
7352 7766
7353 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified 7767 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
7354 explicitly if Samtools is in the PATH already. 7768 explicitly if Samtools is in the PATH already.
7355 7769
7770 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
7771 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
7772
7773 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
7774 this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
7775 both the appended read IDs and custom FLAG values may cause problems with some downstream tools
7776 such as Picard, new defaults were implemented as of version 0.8.3.
7777
7778
7779 default old_flag
7780 =================== ===================
7781 Read 1 Read 2 Read 1 Read 2
7782
7783 OT: 99 147 67 131
7784
7785 OB: 83 163 115 179
7786
7787 CTOT: 99 147 67 131
7788
7789 CTOB: 83 163 115 179
7790
7356 7791
7357 7792
7358 Other: 7793 Other:
7359 7794
7360 -h/--help Displays this help file. 7795 -h/--help Displays this help file.
7516 (17) XA/XB-tag (non-bisulfite mismatches) (optional!) 7951 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
7517 7952
7518 Each read of paired-end alignments is written out in a separate line in the above format. 7953 Each read of paired-end alignments is written out in a separate line in the above format.
7519 7954
7520 7955
7521 Last edited on 10 May 2013. 7956 Last edited on 07 October 2013.
7522 7957
7523 HOW_TO 7958 HOW_TO
7524 } 7959 }