Mercurial > repos > bgruening > bismark
comparison bismark @ 3:91f07ff056ca draft
Uploaded
author | bgruening |
---|---|
date | Mon, 14 Apr 2014 16:43:14 -0400 |
parents | 62c6da72dd4a |
children |
comparison
equal
deleted
inserted
replaced
2:82814a8a2395 | 3:91f07ff056ca |
---|---|
22 ## You should have received a copy of the GNU General Public License | 22 ## You should have received a copy of the GNU General Public License |
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | 23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. |
24 | 24 |
25 | 25 |
26 my $parent_dir = getcwd; | 26 my $parent_dir = getcwd; |
27 my $bismark_version = 'v0.7.12'; | 27 my $bismark_version = 'v0.10.0'; |
28 my $command_line = join (" ",@ARGV); | 28 my $command_line = join (" ",@ARGV); |
29 | 29 |
30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail | 30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail |
31 foreach my $arg (@ARGV){ | 31 foreach my $arg (@ARGV){ |
32 if ($arg eq '--solexa1.3-quals'){ | 32 if ($arg eq '--solexa1.3-quals'){ |
33 $arg = '--phred64-quals'; | 33 $arg = '--phred64-quals'; |
34 } | 34 } |
35 } | 35 } |
36 my @filenames; # will be populated by processing the command line | 36 my @filenames; # will be populated by processing the command line |
37 | 37 |
38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat) = process_command_line(); | 38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag) = process_command_line(); |
39 | 39 |
40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment | 40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment |
41 my %chromosomes; # stores the chromosome sequences of the mouse genome | 41 my %chromosomes; # stores the chromosome sequences of the mouse genome |
42 my %counting; # counting various events | 42 my %counting; # counting various events |
43 | 43 |
291 $filename = $sequence_file; | 291 $filename = $sequence_file; |
292 } | 292 } |
293 | 293 |
294 ### printing all alignments to a results file | 294 ### printing all alignments to a results file |
295 my $outfile = $filename; | 295 my $outfile = $filename; |
296 if ($prefix){ | |
297 $outfile = "$prefix.$outfile"; | |
298 } | |
299 | |
296 | 300 |
297 if ($bowtie2){ # SAM format is the default for Bowtie 2 | 301 if ($bowtie2){ # SAM format is the default for Bowtie 2 |
298 $outfile =~ s/$/_bt2_bismark.sam/; | 302 $outfile =~ s/$/_bismark_bt2.sam/; |
299 } | 303 } |
300 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) | 304 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X) |
301 $outfile =~ s/$/_bismark.txt/; | 305 $outfile =~ s/$/_bismark.txt/; |
302 } | 306 } |
303 else{ # SAM is the default output | 307 else{ # SAM is the default output |
325 print OUT "Bismark version: $bismark_version\n"; | 329 print OUT "Bismark version: $bismark_version\n"; |
326 } | 330 } |
327 | 331 |
328 ### printing alignment and methylation call summary to a report file | 332 ### printing alignment and methylation call summary to a report file |
329 my $reportfile = $filename; | 333 my $reportfile = $filename; |
334 if ($prefix){ | |
335 $reportfile = "$prefix.$reportfile"; | |
336 } | |
337 | |
330 if ($bowtie2){ | 338 if ($bowtie2){ |
331 $reportfile =~ s/$/_bt2_bismark_SE_report.txt/; | 339 $reportfile =~ s/$/_bismark_bt2_SE_report.txt/; |
332 } | 340 } |
333 else{ | 341 else{ |
334 $reportfile =~ s/$/_bismark_SE_report.txt/; | 342 $reportfile =~ s/$/_bismark_SE_report.txt/; |
335 } | 343 } |
336 | 344 |
337 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; | 345 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n"; |
338 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; | 346 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n"; |
339 | 347 |
340 if ($unmapped){ | 348 if ($unmapped){ |
341 my $unmapped_file = $filename; | 349 my $unmapped_file = $filename; |
350 if ($prefix){ | |
351 $unmapped_file = "$prefix.$unmapped_file"; | |
352 } | |
353 | |
342 $unmapped_file =~ s/$/_unmapped_reads.txt/; | 354 $unmapped_file =~ s/$/_unmapped_reads.txt/; |
343 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; | 355 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n"; |
344 print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; | 356 print "Unmapped sequences will be written to $output_dir$unmapped_file\n"; |
345 } | 357 } |
346 if ($ambiguous){ | 358 if ($ambiguous){ |
347 my $ambiguous_file = $filename; | 359 my $ambiguous_file = $filename; |
360 if ($prefix){ | |
361 $ambiguous_file = "$prefix.$ambiguous_file"; | |
362 } | |
348 $ambiguous_file =~ s/$/_ambiguous_reads.txt/; | 363 $ambiguous_file =~ s/$/_ambiguous_reads.txt/; |
349 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; | 364 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n"; |
350 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; | 365 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n"; |
351 } | 366 } |
352 | 367 |
397 else{ | 412 else{ |
398 $filename_2 = $sequence_file_2; | 413 $filename_2 = $sequence_file_2; |
399 } | 414 } |
400 | 415 |
401 ### printing all alignments to a results file | 416 ### printing all alignments to a results file |
402 my $outfile = $filename_1; | 417 my $outfile = $filename_1; |
418 | |
419 if ($prefix){ | |
420 $outfile = "$prefix.$outfile"; | |
421 } | |
422 | |
403 if ($bowtie2){ # SAM format is the default Bowtie 2 output | 423 if ($bowtie2){ # SAM format is the default Bowtie 2 output |
404 $outfile =~ s/$/_bismark_bt2_pe.sam/; | 424 $outfile =~ s/$/_bismark_bt2_pe.sam/; |
405 } | 425 } |
406 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) | 426 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X) |
407 $outfile =~ s/$/_bismark_pe.txt/; | 427 $outfile =~ s/$/_bismark_pe.txt/; |
431 print OUT "Bismark version: $bismark_version\n"; | 451 print OUT "Bismark version: $bismark_version\n"; |
432 } | 452 } |
433 | 453 |
434 ### printing alignment and methylation call summary to a report file | 454 ### printing alignment and methylation call summary to a report file |
435 my $reportfile = $filename_1; | 455 my $reportfile = $filename_1; |
456 if ($prefix){ | |
457 $reportfile = "$prefix.$reportfile"; | |
458 } | |
459 | |
436 if ($bowtie2){ | 460 if ($bowtie2){ |
437 $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; | 461 $reportfile =~ s/$/_bismark_bt2_PE_report.txt/; |
438 } | 462 } |
439 else{ | 463 else{ |
440 $reportfile =~ s/$/_bismark_PE_report.txt/; | 464 $reportfile =~ s/$/_bismark_PE_report.txt/; |
447 | 471 |
448 ### Unmapped read output | 472 ### Unmapped read output |
449 if ($unmapped){ | 473 if ($unmapped){ |
450 my $unmapped_1 = $filename_1; | 474 my $unmapped_1 = $filename_1; |
451 my $unmapped_2 = $filename_2; | 475 my $unmapped_2 = $filename_2; |
476 if ($prefix){ | |
477 $unmapped_1 = "$prefix.$unmapped_1"; | |
478 $unmapped_2 = "$prefix.$unmapped_2"; | |
479 } | |
452 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; | 480 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/; |
453 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; | 481 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/; |
454 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; | 482 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n"; |
455 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; | 483 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n"; |
456 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; | 484 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n"; |
457 } | 485 } |
458 | 486 |
459 if ($ambiguous){ | 487 if ($ambiguous){ |
460 my $amb_1 = $filename_1; | 488 my $amb_1 = $filename_1; |
461 my $amb_2 = $filename_2; | 489 my $amb_2 = $filename_2; |
490 if ($prefix){ | |
491 $amb_1 = "$prefix.$amb_1"; | |
492 $amb_2 = "$prefix.$amb_2"; | |
493 } | |
494 | |
462 $amb_1 =~ s/$/_ambiguous_reads_1.txt/; | 495 $amb_1 =~ s/$/_ambiguous_reads_1.txt/; |
463 $amb_2 =~ s/$/_ambiguous_reads_2.txt/; | 496 $amb_2 =~ s/$/_ambiguous_reads_2.txt/; |
464 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; | 497 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n"; |
465 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; | 498 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n"; |
466 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; | 499 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n"; |
576 warn "Final Cytosine Methylation Report\n",'='x33,"\n"; | 609 warn "Final Cytosine Methylation Report\n",'='x33,"\n"; |
577 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; | 610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; |
578 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; | 611 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; |
579 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | 612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; |
580 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | 613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; |
581 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | 614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; |
582 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | 615 if ($bowtie2){ |
583 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | 616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; |
584 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | 617 } |
618 warn "\n"; | |
619 | |
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
623 if ($bowtie2){ | |
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; | |
625 } | |
626 warn "\n"; | |
585 | 627 |
586 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | 628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; |
587 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | 629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; |
588 print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n"; | 630 |
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
589 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | 632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; |
590 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | 633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; |
591 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | 634 if ($bowtie2){ |
592 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | 635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; |
593 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | 636 } |
637 print REPORT "\n"; | |
638 | |
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
642 if ($bowtie2){ | |
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; | |
644 } | |
645 print REPORT "\n"; | |
594 | 646 |
595 my $percent_meCHG; | 647 my $percent_meCHG; |
596 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | 648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ |
597 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | 649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); |
598 } | 650 } |
604 | 656 |
605 my $percent_meCpG; | 657 my $percent_meCpG; |
606 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | 658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ |
607 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | 659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); |
608 } | 660 } |
661 | |
662 my $percent_meC_unknown; | |
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ | |
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); | |
665 } | |
666 | |
609 | 667 |
610 ### printing methylated CpG percentage if applicable | 668 ### printing methylated CpG percentage if applicable |
611 if ($percent_meCpG){ | 669 if ($percent_meCpG){ |
612 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; | 670 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; |
613 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | 671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; |
627 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | 685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; |
628 } | 686 } |
629 | 687 |
630 ### printing methylated C percentage (CHH context) if applicable | 688 ### printing methylated C percentage (CHH context) if applicable |
631 if ($percent_meCHH){ | 689 if ($percent_meCHH){ |
632 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | 690 warn "C methylated in CHH context:\t${percent_meCHH}%\n"; |
633 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | 691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; |
634 } | 692 } |
635 else{ | 693 else{ |
636 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | 694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; |
637 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | 695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; |
638 } | 696 } |
697 | |
698 ### printing methylated C percentage (Unknown C context) if applicable | |
699 if ($bowtie2){ | |
700 if ($percent_meC_unknown){ | |
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
703 } | |
704 else{ | |
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; | |
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; | |
707 } | |
708 } | |
709 print REPORT "\n\n"; | |
710 warn "\n\n"; | |
639 | 711 |
640 if ($seqID_contains_tabs){ | 712 if ($seqID_contains_tabs){ |
641 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; | 713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; |
642 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; | 714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; |
643 } | 715 } |
716 | |
717 | |
718 ########################################################################################################################################### | |
719 ### create pie-chart with mapping stats | |
720 ########################################################################################################################################### | |
721 | |
722 | |
723 my $filename; | |
724 if ($pbat){ | |
725 $filename = $G_to_A_infile; | |
726 } | |
727 else{ | |
728 $filename = $C_to_T_infile; | |
729 } | |
730 | |
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
732 $pie_chart =~ s/gz$//; | |
733 $pie_chart =~ s/_C_to_T\.fastq$//; | |
734 $pie_chart =~ s/_G_to_A\.fastq$//; | |
735 | |
736 # if ($prefix){ | |
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation | |
738 # } | |
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; | |
740 | |
741 | |
742 #Check whether the module GD::Graph is installed | |
743 my $gd_graph_installed = 0; | |
744 eval{ | |
745 require GD::Graph::pie; | |
746 GD::Graph::pie->import(); | |
747 }; | |
748 | |
749 unless($@) { | |
750 $gd_graph_installed = 1; | |
751 } | |
752 else{ | |
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; | |
754 sleep(2); | |
755 } | |
756 | |
757 if ($gd_graph_installed){ | |
758 warn "Generating pie chart\n\n"; | |
759 sleep(1); | |
760 my $graph = GD::Graph::pie->new(600,600); | |
761 | |
762 my $percent_unaligned; | |
763 my $percent_multiple; | |
764 my $percent_unextractable; | |
765 | |
766 if ($counting{sequences_count}){ | |
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); | |
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); | |
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); | |
770 } | |
771 else{ | |
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; | |
773 } | |
774 | |
775 my @aln_stats = ( | |
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], | |
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], | |
778 ); | |
779 | |
780 $graph->set( | |
781 start_angle => 180, | |
782 '3d' => 0, | |
783 label => 'Alignment stats (single-end)', | |
784 suppress_angle => 2, # Only label slices of sufficient size | |
785 transparent => 0, | |
786 dclrs => [ qw(red lorange dgreen cyan) ], | |
787 ) or die $graph->error; | |
788 | |
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error; | |
790 | |
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; | |
792 binmode PIE; | |
793 print PIE $gd->png; | |
794 } | |
795 | |
796 warn "====================\nBismark run complete\n====================\n\n"; | |
797 | |
644 } | 798 } |
799 | |
645 | 800 |
646 sub print_final_analysis_report_paired_ends{ | 801 sub print_final_analysis_report_paired_ends{ |
647 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; | 802 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; |
648 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles | 803 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles |
649 if ($directional){ | 804 if ($directional){ |
734 | 889 |
735 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; | 890 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count}; |
736 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; | 891 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; |
737 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | 892 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; |
738 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | 893 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; |
739 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | 894 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; |
740 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | 895 if ($bowtie2){ |
741 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | 896 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; |
742 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | 897 } |
898 warn "\n"; | |
899 | |
900 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
901 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
902 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
903 if ($bowtie2){ | |
904 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; | |
905 } | |
906 warn "\n"; | |
743 | 907 |
744 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | 908 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; |
745 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | 909 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; |
746 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | 910 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; |
747 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n"; | 911 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; |
748 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | 912 if ($bowtie2){ |
749 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | 913 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n"; |
750 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n"; | 914 } |
915 print REPORT "\n"; | |
916 | |
917 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
918 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
919 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
920 if ($bowtie2){ | |
921 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n"; | |
922 } | |
923 print REPORT "\n"; | |
751 | 924 |
752 my $percent_meCHG; | 925 my $percent_meCHG; |
753 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | 926 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ |
754 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | 927 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); |
755 } | 928 } |
761 | 934 |
762 my $percent_meCpG; | 935 my $percent_meCpG; |
763 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | 936 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ |
764 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | 937 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); |
765 } | 938 } |
939 | |
940 my $percent_meC_unknown; | |
941 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ | |
942 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); | |
943 } | |
944 | |
766 | 945 |
767 ### printing methylated CpG percentage if applicable | 946 ### printing methylated CpG percentage if applicable |
768 if ($percent_meCpG){ | 947 if ($percent_meCpG){ |
769 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; | 948 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; |
770 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | 949 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; |
784 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | 963 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; |
785 } | 964 } |
786 | 965 |
787 ### printing methylated C percentage in CHH context if applicable | 966 ### printing methylated C percentage in CHH context if applicable |
788 if ($percent_meCHH){ | 967 if ($percent_meCHH){ |
789 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | 968 warn "C methylated in CHH context:\t${percent_meCHH}%\n"; |
790 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n"; | 969 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; |
791 } | 970 } |
792 else{ | 971 else{ |
793 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | 972 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; |
794 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n"; | 973 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; |
795 } | 974 } |
975 | |
976 ### printing methylated C percentage (Unknown C context) if applicable | |
977 if ($bowtie2){ | |
978 if ($percent_meC_unknown){ | |
979 warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
980 print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
981 } | |
982 else{ | |
983 warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; | |
984 print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n"; | |
985 } | |
986 } | |
987 print REPORT "\n\n"; | |
988 warn "\n\n"; | |
989 | |
990 | |
991 ############################################################################################################################################ | |
992 ### create pie-chart with mapping stats | |
993 ########################################################################################################################################### | |
994 | |
995 my $filename; | |
996 if ($pbat){ | |
997 $filename = $G_to_A_infile_1; | |
998 } | |
999 else{ | |
1000 $filename = $C_to_T_infile_1; | |
1001 } | |
1002 | |
1003 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
1004 $pie_chart =~ s/gz$//; | |
1005 $pie_chart =~ s/_C_to_T.fastq$//; | |
1006 $pie_chart =~ s/_G_to_A.fastq$//; | |
1007 ### special format for gzipped PE Bowtie1 files | |
1008 $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//; | |
1009 $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//; | |
1010 | |
1011 if ($prefix){ | |
1012 # prefix is now being prepended to the temp files already | |
1013 # $pie_chart = "$prefix.$pie_chart"; | |
1014 } | |
1015 $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png"; | |
1016 | |
1017 #Check whether the module GD::Graph is installed | |
1018 my $gd_graph_installed = 0; | |
1019 eval{ | |
1020 require GD::Graph::pie; | |
1021 GD::Graph::pie->import(); | |
1022 }; | |
1023 | |
1024 unless($@) { | |
1025 $gd_graph_installed = 1; | |
1026 } | |
1027 else{ | |
1028 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; | |
1029 sleep(2); | |
1030 } | |
1031 | |
1032 if ($gd_graph_installed){ | |
1033 warn "Generating pie chart\n\n"; | |
1034 sleep(1); | |
1035 my $graph = GD::Graph::pie->new(600,600); | |
1036 | |
1037 my $percent_unaligned; | |
1038 my $percent_multiple; | |
1039 my $percent_unextractable; | |
1040 | |
1041 if ($counting{sequences_count}){ | |
1042 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); | |
1043 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); | |
1044 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); | |
1045 } | |
1046 else{ | |
1047 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; | |
1048 } | |
1049 | |
1050 my @aln_stats = ( | |
1051 ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], | |
1052 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], | |
1053 ); | |
1054 | |
1055 # push @{$mbias_read1[0]},$pos; | |
1056 | |
1057 $graph->set( | |
1058 start_angle => 180, | |
1059 '3d' => 0, | |
1060 label => 'Alignment stats (paired-end)', | |
1061 suppress_angle => 2, # Only label slices of sufficient size | |
1062 transparent => 0, | |
1063 dclrs => [ qw(red lorange dgreen cyan) ], | |
1064 ) or die $graph->error; | |
1065 | |
1066 my $gd = $graph->plot(\@aln_stats) or die $graph->error; | |
1067 | |
1068 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; | |
1069 binmode PIE; | |
1070 print PIE $gd->png; | |
1071 } | |
1072 | |
1073 warn "====================\nBismark run complete\n====================\n\n"; | |
796 | 1074 |
797 } | 1075 } |
798 | 1076 |
799 sub process_single_end_fastA_file_for_methylation_call{ | 1077 sub process_single_end_fastA_file_for_methylation_call{ |
800 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; | 1078 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_; |
829 if ($upto){ | 1107 if ($upto){ |
830 last if ($count > $upto); | 1108 last if ($count > $upto); |
831 } | 1109 } |
832 | 1110 |
833 $counting{sequences_count}++; | 1111 $counting{sequences_count}++; |
834 if ($counting{sequences_count}%100000==0) { | 1112 if ($counting{sequences_count}%1000000==0) { |
835 warn "Processed $counting{sequences_count} sequences so far\n"; | 1113 warn "Processed $counting{sequences_count} sequences so far\n"; |
836 } | 1114 } |
837 chomp $sequence; | 1115 chomp $sequence; |
838 chomp $identifier; | 1116 chomp $identifier; |
839 | 1117 |
991 if ($upto){ | 1269 if ($upto){ |
992 last if ($count > $upto); | 1270 last if ($count > $upto); |
993 } | 1271 } |
994 | 1272 |
995 $counting{sequences_count}++; | 1273 $counting{sequences_count}++; |
996 if ($counting{sequences_count}%100000==0) { | 1274 if ($counting{sequences_count}%1000000==0) { |
997 warn "Processed $counting{sequences_count} sequences so far\n"; | 1275 warn "Processed $counting{sequences_count} sequence pairs so far\n"; |
998 } | 1276 } |
999 my $orig_identifier_1 = $identifier_1; | 1277 my $orig_identifier_1 = $identifier_1; |
1000 my $orig_identifier_2 = $identifier_2; | 1278 my $orig_identifier_2 = $identifier_2; |
1001 | 1279 |
1002 chomp $sequence_1; | 1280 chomp $sequence_1; |
1088 if ($upto){ | 1366 if ($upto){ |
1089 last if ($count > $upto); | 1367 last if ($count > $upto); |
1090 } | 1368 } |
1091 | 1369 |
1092 $counting{sequences_count}++; | 1370 $counting{sequences_count}++; |
1093 if ($counting{sequences_count}%100000==0) { | 1371 if ($counting{sequences_count}%1000000==0) { |
1094 warn "Processed $counting{sequences_count} sequences so far\n"; | 1372 warn "Processed $counting{sequences_count} sequence pairs so far\n"; |
1095 } | 1373 } |
1096 | 1374 |
1097 my $orig_identifier_1 = $identifier_1; | 1375 my $orig_identifier_1 = $identifier_1; |
1098 my $orig_identifier_2 = $identifier_2; | 1376 my $orig_identifier_2 = $identifier_2; |
1099 | 1377 |
2776 $counting{unique_best_alignment_count}++; | 3054 $counting{unique_best_alignment_count}++; |
2777 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); | 3055 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); |
2778 | 3056 |
2779 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call | 3057 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call |
2780 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | 3058 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ |
2781 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; | 3059 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; |
2782 $counting{genomic_sequence_could_not_be_extracted_count}++; | 3060 $counting{genomic_sequence_could_not_be_extracted_count}++; |
2783 return 0; | 3061 return 0; |
2784 } | 3062 } |
2785 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | 3063 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ |
2786 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; | 3064 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; |
2787 $counting{genomic_sequence_could_not_be_extracted_count}++; | 3065 $counting{genomic_sequence_could_not_be_extracted_count}++; |
2788 return 0; | 3066 return 0; |
2789 } | 3067 } |
2790 | 3068 |
2791 ### now we are set to perform the actual methylation call | 3069 ### now we are set to perform the actual methylation call |
3075 | 3353 |
3076 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; | 3354 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; |
3077 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; | 3355 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; |
3078 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; | 3356 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; |
3079 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; | 3357 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; |
3080 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; | 3358 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; |
3359 # sleep(10); | |
3081 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and | 3360 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and |
3082 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence | 3361 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence |
3083 | 3362 |
3084 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, | 3363 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, |
3085 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions | 3364 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions |
3109 shift @ops_2; # remove the empty first element | 3388 shift @ops_2; # remove the empty first element |
3110 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | 3389 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); |
3111 | 3390 |
3112 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output | 3391 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output |
3113 my $indels_2 = 0; | 3392 my $indels_2 = 0; |
3114 | 3393 |
3115 ### Extracting read 1 genomic sequence ### | 3394 ### Extracting read 1 genomic sequence ### |
3116 | 3395 |
3117 # extracting 2 additional bp at the 5' end (read 1) | 3396 # extracting 2 additional bp at the 5' end (read 1) |
3118 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ | 3397 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ |
3119 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | 3398 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome |
3158 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | 3437 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome |
3159 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise | 3438 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise |
3160 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | 3439 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; |
3161 return; | 3440 return; |
3162 } | 3441 } |
3442 | |
3163 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); | 3443 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); |
3164 } | 3444 } |
3165 | 3445 |
3166 | 3446 |
3167 ### Extracting read 2 genomic sequence ### | 3447 ### Extracting read 2 genomic sequence ### |
3207 | 3487 |
3208 ### 3' end of read 2 | 3488 ### 3' end of read 2 |
3209 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ | 3489 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ |
3210 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | 3490 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome |
3211 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise | 3491 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise |
3492 # need to set read 1 as well now to prevent warning | |
3493 # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; | |
3494 # sleep(5); | |
3495 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
3212 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; | 3496 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; |
3213 return; | 3497 return; |
3214 } | 3498 } |
3215 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); | 3499 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); |
3216 } | 3500 } |
3807 ### x for not methylated C in CHG context (was converted) ### | 4091 ### x for not methylated C in CHG context (was converted) ### |
3808 ### H for methylated C in CHH context (was protected) ### | 4092 ### H for methylated C in CHH context (was protected) ### |
3809 ### h for not methylated C in CHH context (was converted) ### | 4093 ### h for not methylated C in CHH context (was converted) ### |
3810 ### Z for methylated C in CpG context (was protected) ### | 4094 ### Z for methylated C in CpG context (was protected) ### |
3811 ### z for not methylated C in CpG context (was converted) ### | 4095 ### z for not methylated C in CpG context (was converted) ### |
4096 ### U for methylated C in unknown context (was protected) ### | |
4097 ### u for not methylated C in unknwon context (was converted) ### | |
3812 ################################################################# | 4098 ################################################################# |
3813 | 4099 |
3814 my @match =(); | 4100 my @match =(); |
3815 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 | 4101 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 |
3816 my $methyl_CHH_count = 0; | 4102 my $methyl_CHH_count = 0; |
3817 my $methyl_CHG_count = 0; | 4103 my $methyl_CHG_count = 0; |
3818 my $methyl_CpG_count = 0; | 4104 my $methyl_CpG_count = 0; |
4105 my $methyl_C_unknown_count = 0; | |
3819 my $unmethylated_CHH_count = 0; | 4106 my $unmethylated_CHH_count = 0; |
3820 my $unmethylated_CHG_count = 0; | 4107 my $unmethylated_CHG_count = 0; |
3821 my $unmethylated_CpG_count = 0; | 4108 my $unmethylated_CpG_count = 0; |
4109 my $unmethylated_C_unknown_count = 0; | |
3822 | 4110 |
3823 if ($read_conversion eq 'CT'){ | 4111 if ($read_conversion eq 'CT'){ |
3824 for my $index (0..$#seq) { | 4112 for my $index (0..$#seq) { |
3825 if ($seq[$index] eq $genomic[$index]) { | 4113 if ($seq[$index] eq $genomic[$index]) { |
3826 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation | 4114 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation |
3830 | 4118 |
3831 if ($downstream_base eq 'G'){ | 4119 if ($downstream_base eq 'G'){ |
3832 ++$methyl_CpG_count; | 4120 ++$methyl_CpG_count; |
3833 push @match,'Z'; # protected C, methylated, in CpG context | 4121 push @match,'Z'; # protected C, methylated, in CpG context |
3834 } | 4122 } |
3835 | 4123 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) |
4124 ++$methyl_C_unknown_count; | |
4125 push @match,'U'; # protected C, methylated, in Unknown context | |
4126 } | |
3836 else { | 4127 else { |
3837 ### C in not in CpG-context, determining the second downstream base context | 4128 ### C in not in CpG-context, determining the second downstream base context |
3838 my $second_downstream_base = $genomic[$index+2]; | 4129 my $second_downstream_base = $genomic[$index+2]; |
3839 | 4130 |
3840 if ($second_downstream_base eq 'G'){ | 4131 if ($second_downstream_base eq 'G'){ |
3841 ++$methyl_CHG_count; | 4132 ++$methyl_CHG_count; |
3842 push @match,'X'; # protected C, methylated, in CHG context | 4133 push @match,'X'; # protected C, methylated, in CHG context |
4134 } | |
4135 elsif ($second_downstream_base eq 'N'){ | |
4136 ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
4137 push @match,'U'; # protected C, methylated, in Unknown context | |
3843 } | 4138 } |
3844 else{ | 4139 else{ |
3845 ++$methyl_CHH_count; | 4140 ++$methyl_CHH_count; |
3846 push @match,'H'; # protected C, methylated, in CHH context | 4141 push @match,'H'; # protected C, methylated, in CHH context |
3847 } | 4142 } |
3860 | 4155 |
3861 if ($downstream_base eq 'G'){ | 4156 if ($downstream_base eq 'G'){ |
3862 ++$unmethylated_CpG_count; | 4157 ++$unmethylated_CpG_count; |
3863 push @match,'z'; # converted C, not methylated, in CpG context | 4158 push @match,'z'; # converted C, not methylated, in CpG context |
3864 } | 4159 } |
3865 | 4160 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) |
4161 ++$unmethylated_C_unknown_count; | |
4162 push @match,'u'; # converted C, not methylated, in Unknown context | |
4163 } | |
3866 else{ | 4164 else{ |
3867 ### C in not in CpG-context, determining the second downstream base context | 4165 ### C in not in CpG-context, determining the second downstream base context |
3868 my $second_downstream_base = $genomic[$index+2]; | 4166 my $second_downstream_base = $genomic[$index+2]; |
3869 | 4167 |
3870 if ($second_downstream_base eq 'G'){ | 4168 if ($second_downstream_base eq 'G'){ |
3871 ++$unmethylated_CHG_count; | 4169 ++$unmethylated_CHG_count; |
3872 push @match,'x'; # converted C, not methylated, in CHG context | 4170 push @match,'x'; # converted C, not methylated, in CHG context |
4171 } | |
4172 elsif ($second_downstream_base eq 'N'){ | |
4173 ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
4174 push @match,'u'; # converted C, not methylated, in Unknown context | |
3873 } | 4175 } |
3874 else{ | 4176 else{ |
3875 ++$unmethylated_CHH_count; | 4177 ++$unmethylated_CHH_count; |
3876 push @match,'h'; # converted C, not methylated, in CHH context | 4178 push @match,'h'; # converted C, not methylated, in CHH context |
3877 } | 4179 } |
3901 | 4203 |
3902 if ($upstream_base eq 'C'){ | 4204 if ($upstream_base eq 'C'){ |
3903 ++$methyl_CpG_count; | 4205 ++$methyl_CpG_count; |
3904 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context | 4206 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context |
3905 } | 4207 } |
3906 | 4208 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) |
4209 ++$methyl_C_unknown_count; | |
4210 push @match,'U'; # protected C on opposing strand, methylated, in Unknown context | |
4211 } | |
3907 else{ | 4212 else{ |
3908 ### C in not in CpG-context, determining the second upstream base context | 4213 ### C in not in CpG-context, determining the second upstream base context |
3909 my $second_upstream_base = $genomic[$index]; | 4214 my $second_upstream_base = $genomic[$index]; |
3910 | 4215 |
3911 if ($second_upstream_base eq 'C'){ | 4216 if ($second_upstream_base eq 'C'){ |
3912 ++$methyl_CHG_count; | 4217 ++$methyl_CHG_count; |
3913 push @match,'X'; # protected C on opposing strand, methylated, in CHG context | 4218 push @match,'X'; # protected C on opposing strand, methylated, in CHG context |
4219 } | |
4220 elsif ($second_upstream_base eq 'N'){ | |
4221 ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
4222 push @match,'U'; # protected C, methylated, in Unknown context | |
3914 } | 4223 } |
3915 else{ | 4224 else{ |
3916 ++$methyl_CHH_count; | 4225 ++$methyl_CHH_count; |
3917 push @match,'H'; # protected C on opposing strand, methylated, in CHH context | 4226 push @match,'H'; # protected C on opposing strand, methylated, in CHH context |
3918 } | 4227 } |
3933 | 4242 |
3934 if ($upstream_base eq 'C'){ | 4243 if ($upstream_base eq 'C'){ |
3935 ++$unmethylated_CpG_count; | 4244 ++$unmethylated_CpG_count; |
3936 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context | 4245 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context |
3937 } | 4246 } |
3938 | 4247 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) |
4248 ++$unmethylated_C_unknown_count; | |
4249 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context | |
4250 } | |
3939 else{ | 4251 else{ |
3940 ### C in not in CpG-context, determining the second upstream base context | 4252 ### C in not in CpG-context, determining the second upstream base context |
3941 my $second_upstream_base = $genomic[$index]; | 4253 my $second_upstream_base = $genomic[$index]; |
3942 | 4254 |
3943 if ($second_upstream_base eq 'C'){ | 4255 if ($second_upstream_base eq 'C'){ |
3944 ++$unmethylated_CHG_count; | 4256 ++$unmethylated_CHG_count; |
3945 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context | 4257 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context |
4258 } | |
4259 elsif ($second_upstream_base eq 'N'){ | |
4260 ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
4261 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context | |
3946 } | 4262 } |
3947 else{ | 4263 else{ |
3948 ++$unmethylated_CHH_count; | 4264 ++$unmethylated_CHH_count; |
3949 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context | 4265 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context |
3950 } | 4266 } |
3967 my $methylation_call = join ("",@match); | 4283 my $methylation_call = join ("",@match); |
3968 | 4284 |
3969 $counting{total_meCHH_count} += $methyl_CHH_count; | 4285 $counting{total_meCHH_count} += $methyl_CHH_count; |
3970 $counting{total_meCHG_count} += $methyl_CHG_count; | 4286 $counting{total_meCHG_count} += $methyl_CHG_count; |
3971 $counting{total_meCpG_count} += $methyl_CpG_count; | 4287 $counting{total_meCpG_count} += $methyl_CpG_count; |
4288 $counting{total_meC_unknown_count} += $methyl_C_unknown_count; | |
3972 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; | 4289 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; |
3973 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; | 4290 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; |
3974 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; | 4291 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; |
4292 $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; | |
3975 | 4293 |
3976 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; | 4294 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; |
3977 return $methylation_call; | 4295 return $methylation_call; |
3978 } | 4296 } |
3979 | 4297 |
4104 else{ | 4422 else{ |
4105 $C_to_T_infile =~ s/$/_C_to_T.fa/; | 4423 $C_to_T_infile =~ s/$/_C_to_T.fa/; |
4106 $G_to_A_infile =~ s/$/_G_to_A.fa/; | 4424 $G_to_A_infile =~ s/$/_G_to_A.fa/; |
4107 } | 4425 } |
4108 | 4426 |
4427 if ($prefix){ | |
4428 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; | |
4429 $C_to_T_infile = "$prefix.$C_to_T_infile"; | |
4430 $G_to_A_infile = "$prefix.$G_to_A_infile"; | |
4431 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; | |
4432 } | |
4433 | |
4109 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; | 4434 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; |
4110 | 4435 |
4111 if ($gzip){ | 4436 if ($gzip){ |
4112 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; | 4437 open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n"; |
4113 } | 4438 } |
4207 warn "Processing reads up to sequence no. $upto from $file\n"; | 4532 warn "Processing reads up to sequence no. $upto from $file\n"; |
4208 sleep (1); | 4533 sleep (1); |
4209 } | 4534 } |
4210 | 4535 |
4211 my $C_to_T_infile = my $G_to_A_infile = $filename; | 4536 my $C_to_T_infile = my $G_to_A_infile = $filename; |
4537 | |
4212 $C_to_T_infile =~ s/$/_C_to_T.fa/; | 4538 $C_to_T_infile =~ s/$/_C_to_T.fa/; |
4213 $G_to_A_infile =~ s/$/_G_to_A.fa/; | 4539 $G_to_A_infile =~ s/$/_G_to_A.fa/; |
4540 | |
4541 if ($prefix){ | |
4542 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; | |
4543 $C_to_T_infile = "$prefix.$C_to_T_infile"; | |
4544 $G_to_A_infile = "$prefix.$G_to_A_infile"; | |
4545 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; | |
4546 } | |
4214 | 4547 |
4215 if ($directional){ | 4548 if ($directional){ |
4216 if ($read_number == 1){ | 4549 if ($read_number == 1){ |
4217 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; | 4550 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; |
4218 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; | 4551 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; |
4351 warn "Processing reads up to sequence no. $upto from $file\n"; | 4684 warn "Processing reads up to sequence no. $upto from $file\n"; |
4352 sleep (1); | 4685 sleep (1); |
4353 } | 4686 } |
4354 | 4687 |
4355 my $C_to_T_infile = my $G_to_A_infile = $filename; | 4688 my $C_to_T_infile = my $G_to_A_infile = $filename; |
4689 | |
4690 if ($prefix){ | |
4691 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; | |
4692 $C_to_T_infile = "$prefix.$C_to_T_infile"; | |
4693 $G_to_A_infile = "$prefix.$G_to_A_infile"; | |
4694 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; | |
4695 } | |
4356 | 4696 |
4357 if ($pbat){ # PBAT-Seq | 4697 if ($pbat){ # PBAT-Seq |
4358 if ($gzip){ | 4698 if ($gzip){ |
4359 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; | 4699 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; |
4360 } | 4700 } |
4511 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; | 4851 $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/; |
4512 } | 4852 } |
4513 else{ | 4853 else{ |
4514 $C_to_T_infile =~ s/$/_C_to_T.fastq/; | 4854 $C_to_T_infile =~ s/$/_C_to_T.fastq/; |
4515 $G_to_A_infile =~ s/$/_G_to_A.fastq/; | 4855 $G_to_A_infile =~ s/$/_G_to_A.fastq/; |
4856 } | |
4857 | |
4858 if ($prefix){ | |
4859 # warn "Prefixing $prefix:\nold: $C_to_T_infile\nold: $G_to_A_infile\n\n"; | |
4860 $C_to_T_infile = "$prefix.$C_to_T_infile"; | |
4861 $G_to_A_infile = "$prefix.$G_to_A_infile"; | |
4862 # warn "Prefixing $prefix:\nnew: $C_to_T_infile\nnew: $G_to_A_infile\n\n"; | |
4516 } | 4863 } |
4517 | 4864 |
4518 if ($directional){ | 4865 if ($directional){ |
4519 if ($read_number == 1){ | 4866 if ($read_number == 1){ |
4520 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; | 4867 warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n"; |
4683 sleep (1); | 5030 sleep (1); |
4684 } | 5031 } |
4685 | 5032 |
4686 my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; | 5033 my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; |
4687 | 5034 |
5035 if ($prefix){ | |
5036 # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; | |
5037 $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; | |
5038 $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; | |
5039 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; | |
5040 } | |
5041 | |
4688 $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; | 5042 $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; |
4689 $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; | 5043 $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; |
5044 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; | |
4690 | 5045 |
4691 warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; | 5046 warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; |
4692 open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; | 5047 open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; |
4693 # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; | 5048 # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; |
4694 | 5049 |
5555 my $filename = shift; | 5910 my $filename = shift; |
5556 %counting=( | 5911 %counting=( |
5557 total_meCHH_count => 0, | 5912 total_meCHH_count => 0, |
5558 total_meCHG_count => 0, | 5913 total_meCHG_count => 0, |
5559 total_meCpG_count => 0, | 5914 total_meCpG_count => 0, |
5915 total_meC_unknown_count => 0, | |
5560 total_unmethylated_CHH_count => 0, | 5916 total_unmethylated_CHH_count => 0, |
5561 total_unmethylated_CHG_count => 0, | 5917 total_unmethylated_CHG_count => 0, |
5562 total_unmethylated_CpG_count => 0, | 5918 total_unmethylated_CpG_count => 0, |
5919 total_unmethylated_C_unknown_count => 0, | |
5563 sequences_count => 0, | 5920 sequences_count => 0, |
5564 no_single_alignment_found => 0, | 5921 no_single_alignment_found => 0, |
5565 unsuitable_sequence_count => 0, | 5922 unsuitable_sequence_count => 0, |
5566 genomic_sequence_could_not_be_extracted_count => 0, | 5923 genomic_sequence_could_not_be_extracted_count => 0, |
5567 unique_best_alignment_count => 0, | 5924 unique_best_alignment_count => 0, |
5741 my $non_bs_mm; | 6098 my $non_bs_mm; |
5742 my $samtools_path; | 6099 my $samtools_path; |
5743 my $bam; | 6100 my $bam; |
5744 my $gzip; | 6101 my $gzip; |
5745 my $pbat; | 6102 my $pbat; |
6103 my $prefix; | |
6104 my $old_flag; | |
5746 | 6105 |
5747 my $command_line = GetOptions ('help|man' => \$help, | 6106 my $command_line = GetOptions ('help|man' => \$help, |
5748 '1=s' => \$mates1, | 6107 '1=s' => \$mates1, |
5749 '2=s' => \$mates2, | 6108 '2=s' => \$mates2, |
5750 'path_to_bowtie=s' => \$path_to_bowtie, | 6109 'path_to_bowtie=s' => \$path_to_bowtie, |
5782 'non_bs_mm' => \$non_bs_mm, | 6141 'non_bs_mm' => \$non_bs_mm, |
5783 'samtools_path=s' => \$samtools_path, | 6142 'samtools_path=s' => \$samtools_path, |
5784 'bam' => \$bam, | 6143 'bam' => \$bam, |
5785 'gzip' => \$gzip, | 6144 'gzip' => \$gzip, |
5786 'pbat' => \$pbat, | 6145 'pbat' => \$pbat, |
6146 'prefix=s' => \$prefix, | |
6147 'old_flag' => \$old_flag, | |
5787 ); | 6148 ); |
5788 | 6149 |
5789 | 6150 |
5790 ### EXIT ON ERROR if there were errors with any of the supplied options | 6151 ### EXIT ON ERROR if there were errors with any of the supplied options |
5791 unless ($command_line){ | 6152 unless ($command_line){ |
5937 ### checking the integrity of $CT_dir | 6298 ### checking the integrity of $CT_dir |
5938 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | 6299 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; |
5939 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); | 6300 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); |
5940 foreach my $file(@CT_bowtie_index){ | 6301 foreach my $file(@CT_bowtie_index){ |
5941 unless (-f $file){ | 6302 unless (-f $file){ |
5942 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n"; | 6303 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n"; |
5943 } | 6304 } |
5944 } | 6305 } |
5945 ### checking the integrity of $GA_dir | 6306 ### checking the integrity of $GA_dir |
5946 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | 6307 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; |
5947 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); | 6308 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); |
5948 foreach my $file(@GA_bowtie_index){ | 6309 foreach my $file(@GA_bowtie_index){ |
5949 unless (-f $file){ | 6310 unless (-f $file){ |
5950 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; | 6311 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n"; |
5951 } | 6312 } |
5952 } | 6313 } |
5953 } | 6314 } |
5954 | 6315 |
5955 else{ ### Bowtie 1 (default) | 6316 else{ ### Bowtie 1 (default) |
5956 ### checking the integrity of $CT_dir | 6317 ### checking the integrity of $CT_dir |
5957 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | 6318 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; |
5958 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); | 6319 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); |
5959 foreach my $file(@CT_bowtie_index){ | 6320 foreach my $file(@CT_bowtie_index){ |
5960 unless (-f $file){ | 6321 unless (-f $file){ |
5961 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; | 6322 die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; |
5962 } | 6323 } |
5963 } | 6324 } |
5964 ### checking the integrity of $GA_dir | 6325 ### checking the integrity of $GA_dir |
5965 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | 6326 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; |
5966 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); | 6327 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); |
5967 foreach my $file(@GA_bowtie_index){ | 6328 foreach my $file(@GA_bowtie_index){ |
5968 unless (-f $file){ | 6329 unless (-f $file){ |
5969 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n"; | 6330 die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; |
5970 } | 6331 } |
5971 } | 6332 } |
5972 } | 6333 } |
5973 | 6334 |
5974 my $CT_index_basename = "${CT_dir}BS_CT"; | 6335 my $CT_index_basename = "${CT_dir}BS_CT"; |
6245 } | 6606 } |
6246 if ($bowtie2){ | 6607 if ($bowtie2){ |
6247 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments | 6608 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments |
6248 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones | 6609 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones |
6249 } | 6610 } |
6611 | |
6612 if ($old_flag){ | |
6613 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); | |
6614 sleep(3); | |
6615 } | |
6250 } | 6616 } |
6251 elsif ($mates2){ | 6617 elsif ($mates2){ |
6252 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; | 6618 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; |
6253 } | 6619 } |
6254 | 6620 |
6398 if ($vanilla){ | 6764 if ($vanilla){ |
6399 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; | 6765 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; |
6400 } | 6766 } |
6401 } | 6767 } |
6402 | 6768 |
6403 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat); | 6769 ### PREFIX FOR OUTPUT FILES |
6770 if ($prefix){ | |
6771 # removing trailing dots | |
6772 | |
6773 $prefix =~ s/\.+$//; | |
6774 | |
6775 warn "Using the following prefix for output files: $prefix\n\n"; | |
6776 sleep(1); | |
6777 } | |
6778 | |
6779 | |
6780 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag); | |
6404 } | 6781 } |
6405 | 6782 |
6406 | 6783 |
6407 | 6784 |
6408 sub generate_SAM_header{ | 6785 sub generate_SAM_header{ |
6636 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; | 7013 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; |
6637 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; | 7014 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; |
6638 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; | 7015 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; |
6639 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; | 7016 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; |
6640 | 7017 |
6641 my $id_1 = $id.'/1'; | 7018 my $id_1; |
6642 my $id_2 = $id.'/2'; | 7019 my $id_2; |
7020 | |
7021 if ($old_flag){ | |
7022 $id_1 = $id.'/1'; | |
7023 $id_2 = $id.'/2'; | |
7024 } | |
7025 else{ | |
7026 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard | |
7027 $id_2 = $id; | |
7028 } | |
6643 | 7029 |
6644 # Allows all degenerate nucleotide sequences in reference genome | 7030 # Allows all degenerate nucleotide sequences in reference genome |
6645 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; | 7031 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i; |
6646 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; | 7032 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i; |
6647 | 7033 |
6757 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account | 7143 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account |
6758 | 7144 |
6759 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) | 7145 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) |
6760 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) | 7146 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) |
6761 | 7147 |
6762 my $flag_1; # FLAG variable used for SAM format | 7148 my $flag_1; # FLAG variable used for SAM format |
6763 my $flag_2; | 7149 my $flag_2; |
6764 | 7150 |
7151 ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH) | |
7152 | |
6765 if ($index == 0){ # OT | 7153 if ($index == 0){ # OT |
6766 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) | 7154 unless ($old_flag){ |
6767 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) | 7155 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) |
7156 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) | |
7157 } | |
7158 else{ | |
7159 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) | |
7160 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) | |
7161 } | |
6768 } | 7162 } |
6769 elsif ($index == 1){ # CTOB | 7163 elsif ($index == 1){ # CTOB |
6770 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) | 7164 unless($old_flag){ |
6771 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) | 7165 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) |
7166 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) | |
7167 } | |
7168 else{ | |
7169 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) | |
7170 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) | |
7171 } | |
6772 } | 7172 } |
6773 elsif ($index == 2){ # CTOT | 7173 elsif ($index == 2){ # CTOT |
6774 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) | 7174 unless ($old_flag){ |
6775 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) | 7175 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) |
7176 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) | |
7177 } | |
7178 else{ | |
7179 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) | |
7180 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) | |
7181 } | |
6776 } | 7182 } |
6777 elsif ($index == 3){ # OB | 7183 elsif ($index == 3){ # OB |
6778 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) | 7184 unless ($old_flag){ |
6779 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) | 7185 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) |
7186 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) | |
7187 } | |
7188 else{ | |
7189 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) | |
7190 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) | |
7191 } | |
6780 } | 7192 } |
6781 | 7193 |
6782 ##### | 7194 ##### |
6783 | 7195 |
6784 my $mapq = 255; # Mapping quality is unavailable | 7196 my $mapq = 255; # Mapping quality is unavailable |
6844 # <----------- read 2 read 2 contained within read 1 | 7256 # <----------- read 2 read 2 contained within read 1 |
6845 # | 7257 # |
6846 # or | 7258 # or |
6847 # | 7259 # |
6848 # -------------------------> read 1 | 7260 # -------------------------> read 1 |
6849 # <----------- read 2 read 2 contained within read 1 | 7261 # <------------------------ read 2 read 2 contained within read 1 |
6850 | 7262 |
6851 # start and end of read 2 are fully contained within read 1 | 7263 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable |
6852 $tlen_1 = 0; # Set as 0 when the information is unavailable | 7264 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, |
6853 $tlen_2 = 0; # Set as 0 when the information is unavailable | 7265 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this |
7266 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 | |
6854 } | 7267 } |
6855 | 7268 |
6856 } | 7269 } |
6857 | 7270 |
6858 elsif ($start_read_2 < $start_read_1){ | 7271 elsif ($start_read_2 < $start_read_1){ |
6880 # <----------- read 1 read 1 contained within read 2 | 7293 # <----------- read 1 read 1 contained within read 2 |
6881 # | 7294 # |
6882 # or | 7295 # or |
6883 # | 7296 # |
6884 # -------------------------> read 2 | 7297 # -------------------------> read 2 |
6885 # <----------- read 1 read 1 contained within read 2 | 7298 # <------------------------ read 1 read 1 contained within read 2 |
6886 | 7299 |
6887 # start and end of read 1 are fully contained within read 2 | 7300 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable |
6888 $tlen_1 = 0; # Set as 0 when the information is unavailable | 7301 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, |
6889 $tlen_2 = 0; # Set as 0 when the information is unavailable | 7302 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this |
6890 } | 7303 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 |
7304 } | |
6891 } | 7305 } |
6892 } | 7306 } |
6893 | 7307 |
6894 else{ # Bowtie 1 | 7308 else{ # Bowtie 1 |
6895 | 7309 |
7351 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). | 7765 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). |
7352 | 7766 |
7353 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified | 7767 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified |
7354 explicitly if Samtools is in the PATH already. | 7768 explicitly if Samtools is in the PATH already. |
7355 | 7769 |
7770 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For | |
7771 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. | |
7772 | |
7773 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, | |
7774 this options appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since | |
7775 both the appended read IDs and custom FLAG values may cause problems with some downstream tools | |
7776 such as Picard, new defaults were implemented as of version 0.8.3. | |
7777 | |
7778 | |
7779 default old_flag | |
7780 =================== =================== | |
7781 Read 1 Read 2 Read 1 Read 2 | |
7782 | |
7783 OT: 99 147 67 131 | |
7784 | |
7785 OB: 83 163 115 179 | |
7786 | |
7787 CTOT: 99 147 67 131 | |
7788 | |
7789 CTOB: 83 163 115 179 | |
7790 | |
7356 | 7791 |
7357 | 7792 |
7358 Other: | 7793 Other: |
7359 | 7794 |
7360 -h/--help Displays this help file. | 7795 -h/--help Displays this help file. |
7516 (17) XA/XB-tag (non-bisulfite mismatches) (optional!) | 7951 (17) XA/XB-tag (non-bisulfite mismatches) (optional!) |
7517 | 7952 |
7518 Each read of paired-end alignments is written out in a separate line in the above format. | 7953 Each read of paired-end alignments is written out in a separate line in the above format. |
7519 | 7954 |
7520 | 7955 |
7521 Last edited on 10 May 2013. | 7956 Last edited on 07 October 2013. |
7522 | 7957 |
7523 HOW_TO | 7958 HOW_TO |
7524 } | 7959 } |