annotate bismark @ 0:62c6da72dd4a draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:57:36 -0400
parents
children 91f07ff056ca
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/perl --
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
2 use strict;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
3 use warnings;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
4 use IO::Handle;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
5 use Cwd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
6 $|++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
7 use Getopt::Long;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
8
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
9
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
11
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
15 ## (at your option) any later version.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
16
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
20 ## GNU General Public License for more details.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
21
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
24
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
25
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
26 my $parent_dir = getcwd;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
27 my $bismark_version = 'v0.7.12';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
28 my $command_line = join (" ",@ARGV);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
29
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
30 ### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
31 foreach my $arg (@ARGV){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
32 if ($arg eq '--solexa1.3-quals'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
33 $arg = '--phred64-quals';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
34 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
35 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
36 my @filenames; # will be populated by processing the command line
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
37
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
38 my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_file_format,$bowtie_options,$directional,$unmapped,$ambiguous,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat) = process_command_line();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
39
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
40 my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
41 my %chromosomes; # stores the chromosome sequences of the mouse genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
42 my %counting; # counting various events
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
43
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
44 my $seqID_contains_tabs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
45
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
46 foreach my $filename (@filenames){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
47
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
48 chdir $parent_dir or die "Unable to move to initial working directory $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
49 ### resetting the counting hash and fhs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
50 reset_counters_and_fhs($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
51 $seqID_contains_tabs = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
52
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
53 ### PAIRED-END ALIGNMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
54 if ($filename =~ ','){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
55 my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
56
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
57 $fhs[0]->{name} = 'CTread1GAread2CTgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
58 $fhs[1]->{name} = 'GAread1CTread2GAgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
59 $fhs[2]->{name} = 'GAread1CTread2CTgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
60 $fhs[3]->{name} = 'CTread1GAread2GAgenome';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
61
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
62 warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
63
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
64 my ($filename_1,$filename_2) = (split (/,/,$filename));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
65 warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
66
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
67 ### additional variables only for paired-end alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
68 my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
69
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
70 ### FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
71 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
72 warn "Input files are in FastA format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
73
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
74 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
75 ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
76 ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
77
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
78 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
79 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
80 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
81 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
82 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
83 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
84 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
85 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
86 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
87 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
88 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
89 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
90
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
91 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
92 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
93 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
94 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
95 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
96 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
97 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
98 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
99 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
100
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
101 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
102 paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
103 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
104 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
105 paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
106 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
107 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
108
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
109 ### FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
110 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
111 warn "Input files are in FastQ format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
112 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
113 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
114 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
115 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
116
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
117 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
118 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
119 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
120 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
121 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
122 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
123 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
124 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
125 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
126 else{ # Bowtie 1 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
127 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
128 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
129
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
130 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
131 $fhs[0]->{inputfile_2} = undef; # no longer needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
132 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
133 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
134 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
135 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
136 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
137 $fhs[3]->{inputfile_2} = undef; # no longer needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
138 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
139 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
140 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
141 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
142
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
143 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
144 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
145 $fhs[1]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
146 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
147 $fhs[2]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
148 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
149 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
150 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
151 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
152 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
153 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
154 elsif($pbat){ # PBAT-Seq
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
155 ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
156 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
157 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
158
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
159 $fhs[0]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
160 $fhs[0]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
161 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
162 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
163 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
164 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
165 $fhs[3]->{inputfile_1} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
166 $fhs[3]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
167 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
168 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
169 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
170 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
171 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
172
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
173 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
174 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
175 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
176 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
177 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
178 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
179 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
180 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
181 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
182 else{ # Bowtie 1 alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
183 if ($gzip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
184 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
185
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
186 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
187 $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
188 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
189 $fhs[1]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
190 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
191 $fhs[2]->{inputfile_2} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
192 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
193 $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
194 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
195 else{ #uncompressed temp files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
196 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
197 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
198
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
199 $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
200 $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
201 $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
202 $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
203 $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
204 $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
205 $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
206 $fhs[3]->{inputfile_2} = $G_to_A_infile_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
208 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
209 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
210 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
211 paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
212 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
213 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
214 paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
215 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
216 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
217 start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
218 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
219
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
220 ### Else we are performing SINGLE-END ALIGNMENTS
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
221 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
222 warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
223 ### Initialising bisulfite conversion filenames
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
224 my ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
225
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
226
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
227 ### FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
228 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
229 warn "Inut file is in FastA format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
230 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
231 ($C_to_T_infile) = biTransformFastAFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
232 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
233 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
234 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
235 ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
236 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
237 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
238 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
239
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
240 ### Creating 4 different bowtie filehandles and storing the first entry
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
241 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
242 single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
243 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
244 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
245 single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
246 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
248
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
249 ## FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
250 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
251 warn "Input file is in FastQ format\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
252 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
253 ($C_to_T_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
254 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
255 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
256 elsif($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
257 ($G_to_A_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
258 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
259 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
260 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
261 ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
262 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
263 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
264 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
265
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
266 ### Creating up to 4 different bowtie filehandles and storing the first entry
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
267 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
268 single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
269 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
270 elsif ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
271 single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
272 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
273 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
274 single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
275 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
276 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
277
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
278 start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
279
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
280 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
281 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
282
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
283 sub start_methylation_call_procedure_single_ends {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
284 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
285 my ($dir,$filename);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
286
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
287 if ($sequence_file =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
288 ($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
289 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
290 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
291 $filename = $sequence_file;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
292 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
293
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
294 ### printing all alignments to a results file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
295 my $outfile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
296
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
297 if ($bowtie2){ # SAM format is the default for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
298 $outfile =~ s/$/_bt2_bismark.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
300 elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
301 $outfile =~ s/$/_bismark.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
302 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
303 else{ # SAM is the default output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
304 $outfile =~ s/$/_bismark.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
306
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
307 $bam = 0 unless (defined $bam);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
308
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
309 if ($bam == 1){ ### Samtools is installed, writing out BAM directly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
310 $outfile =~ s/sam/bam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
311 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
312 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
313 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
314 $outfile .= '.gz';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
315 open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
316 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
317 else{ # uncompressed ouput, default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
318 open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
319 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
320
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
321 warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
322 sleep(1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
323
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
324 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
325 print OUT "Bismark version: $bismark_version\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
326 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
327
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
328 ### printing alignment and methylation call summary to a report file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
329 my $reportfile = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
330 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
331 $reportfile =~ s/$/_bt2_bismark_SE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
332 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
333 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
334 $reportfile =~ s/$/_bismark_SE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
335 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
336
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
337 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
338 print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
339
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
340 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
341 my $unmapped_file = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
342 $unmapped_file =~ s/$/_unmapped_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
343 open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
344 print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
345 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
346 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
347 my $ambiguous_file = $filename;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
348 $ambiguous_file =~ s/$/_ambiguous_reads.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
349 open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
350 print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
351 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
352
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
353 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
354 print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
355 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
356 print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
357
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
358
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
359 ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
360 unless (%chromosomes){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
361 my $cwd = getcwd; # storing the path of the current working directory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
362 print "Current working directory is: $cwd\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
363 read_genome_into_memory($cwd);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
364 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
365
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
366 unless ($vanilla or $sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
367 generate_SAM_header();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
368 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
369
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
370 ### Input file is in FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
371 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
372 process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
374 ### Input file is in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
375 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
376 process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
377 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
378 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
379
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
380 sub start_methylation_call_procedure_paired_ends {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
381 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
382
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
383 my ($dir_1,$filename_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
384
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
385 if ($sequence_file_1 =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
386 ($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
387 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
388 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
389 $filename_1 = $sequence_file_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
390 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
391
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
392 my ($dir_2,$filename_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
393
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
394 if ($sequence_file_2 =~ /\//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
395 ($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
396 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
397 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
398 $filename_2 = $sequence_file_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
399 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
400
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
401 ### printing all alignments to a results file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
402 my $outfile = $filename_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
403 if ($bowtie2){ # SAM format is the default Bowtie 2 output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
404 $outfile =~ s/$/_bismark_bt2_pe.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
405 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
406 elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
407 $outfile =~ s/$/_bismark_pe.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
408 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
409 else{ # SAM format is the default Bowtie 1 output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
410 $outfile =~ s/$/_bismark_pe.sam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
411 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
412
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
413 $bam = 0 unless (defined $bam);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
414
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
415 if ($bam == 1){ ### Samtools is installed, writing out BAM directly
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
416 $outfile =~ s/sam/bam/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
417 open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
418 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
419 elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
420 $outfile .= '.gz';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
421 open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
422 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
423 else{ # uncompressed ouput, default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
424 open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
425 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
426
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
427 warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
428 sleep(1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
430 if ($vanilla){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
431 print OUT "Bismark version: $bismark_version\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
432 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
433
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
434 ### printing alignment and methylation call summary to a report file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
435 my $reportfile = $filename_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
436 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
437 $reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
439 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
440 $reportfile =~ s/$/_bismark_PE_report.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
441 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
442
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
443 open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
444 print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
445 print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
446
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
447
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
448 ### Unmapped read output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
449 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
450 my $unmapped_1 = $filename_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
451 my $unmapped_2 = $filename_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
452 $unmapped_1 =~ s/$/_unmapped_reads_1.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
453 $unmapped_2 =~ s/$/_unmapped_reads_2.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
454 open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
455 open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
456 print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
457 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
458
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
459 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
460 my $amb_1 = $filename_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
461 my $amb_2 = $filename_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
462 $amb_1 =~ s/$/_ambiguous_reads_1.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
463 $amb_2 =~ s/$/_ambiguous_reads_2.txt/;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
464 open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
465 open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
466 print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
467 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
468
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
469 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
470 print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
471 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
472
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
473 ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
474 unless (%chromosomes){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
475 my $cwd = getcwd; # storing the path of the current working directory
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
476 print "Current working directory is: $cwd\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
477 read_genome_into_memory($cwd);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
478 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
479
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
480 unless ($vanilla or $sam_no_hd){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
481 generate_SAM_header();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
482 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
483
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
484 ### Input files are in FastA format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
485 if ($sequence_file_format eq 'FASTA'){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
486 process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
487 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
488 ### Input files are in FastQ format
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
489 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
490 process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
491 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
492 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
493
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
494 sub print_final_analysis_report_single_end{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
495 my ($C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
496 ### All sequences from the original sequence file have been analysed now
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
497 ### deleting temporary C->T or G->A infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
498
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
499 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
500 my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
501 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
502 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
503 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
504 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
505 warn "Could not delete temporary file $C_to_T_infile properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
506 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
507 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
508 elsif ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
509 my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
510 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
511 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
512 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
513 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
514 warn "Could not delete temporary file $G_to_A_infile properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
515 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
516 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
517 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
518 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
519 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
520 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
521 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
522 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
523 warn "Could not delete temporary files properly $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
524 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
525 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
526
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
527 ### printing a final report for the alignment procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
528 print REPORT "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
529 warn "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
530 # foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
531 # print "$fhs[$index]->{name}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
532 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
533 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
534 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
535
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
536 ### printing a final report for the methylation call procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
537 warn "Sequences analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
538 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
539 my $percent_alignable_sequences;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
540
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
541 if ($counting{sequences_count} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
542 $percent_alignable_sequences = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
543 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
544 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
545 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
546 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
547
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
548 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
549 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
550
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
551 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
552 ### only calculating the percentage if there were any overruled alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
553 if ($counting{low_complexity_alignments_overruled_count}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
554 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
555 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
556 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
557
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
558 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
559 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
560 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
561 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
562 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
563
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
564 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
565 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
566 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
567 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
568 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
569
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
570 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
571 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
572 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
573 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
574
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
575 ### detailed information about Cs analysed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
576 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
577 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
578 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
579 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
580 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
581 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
582 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
583 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
584 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
585
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
586 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
587 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
588 print REPORT "Total methylated C's in CpG context:\t $counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
589 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
590 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
591 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
592 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
593 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
594
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
595 my $percent_meCHG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
596 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
597 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
598 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
599
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
600 my $percent_meCHH;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
601 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
602 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
603 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
604
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
605 my $percent_meCpG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
606 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
607 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
608 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
609
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
610 ### printing methylated CpG percentage if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
611 if ($percent_meCpG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
612 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
613 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
614 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
615 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
616 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
617 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
618 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
619
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
620 ### printing methylated C percentage (CHG context) if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
621 if ($percent_meCHG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
622 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
623 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
624 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
625 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
626 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
627 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
628 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
629
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
630 ### printing methylated C percentage (CHH context) if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
631 if ($percent_meCHH){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
632 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
633 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
634 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
635 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
636 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
637 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
638 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
639
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
640 if ($seqID_contains_tabs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
641 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
642 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
643 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
644 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
645
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
646 sub print_final_analysis_report_paired_ends{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
647 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
648 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
649 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
650 if ($G_to_A_infile_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
651 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
652 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
653 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
654 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
655 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
656 warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
657 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
658 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
659 else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
660 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
661 if ($deletion_successful == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
662 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
663 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
664 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
665 warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
666 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
667 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
668 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
669 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
670 if ($G_to_A_infile_2 and $C_to_T_infile_2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
671 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
672 if ($deletion_successful == 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
673 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
674 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
675 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
676 warn "Could not delete temporary files properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
677 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
678 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
679 else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
680 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
681 if ($deletion_successful == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
682 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
683 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
684 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
685 warn "Could not delete temporary files properly: $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
686 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
687 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
688 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
689
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
690 ### printing a final report for the alignment procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
691 warn "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
692 print REPORT "Final Alignment report\n",'='x22,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
693 # foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
694 # print "$fhs[$index]->{name}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
695 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
696 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
697 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
698
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
699 ### printing a final report for the methylation call procedure
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
700 warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
701 print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
702
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
703 my $percent_alignable_sequence_pairs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
704 if ($counting{sequences_count} == 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
705 $percent_alignable_sequence_pairs = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
706 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
707 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
708 $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
710 print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
711 print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
712
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
713 print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
714 print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
715 print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
716 print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
717 print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
718
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
719
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
720 print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
721 print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
722 print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
723 print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
724 print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
725 ### detailed information about Cs analysed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
726
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
727 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
728 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
729 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
730 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
731
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
732 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
733 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
734
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
735 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
736 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
737 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
738 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
739 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
740 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
741 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
742 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
743
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
744 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
745 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
746 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
747 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
748 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
749 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
750 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
751
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
752 my $percent_meCHG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
753 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
754 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
755 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
756
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
757 my $percent_meCHH;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
758 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
759 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
760 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
761
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
762 my $percent_meCpG;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
763 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
764 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
765 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
766
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
767 ### printing methylated CpG percentage if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
768 if ($percent_meCpG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
769 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
770 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
771 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
772 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
773 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
774 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
775 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
776
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
777 ### printing methylated C percentage in CHG context if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
778 if ($percent_meCHG){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
779 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
780 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
781 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
782 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
783 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
784 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
785 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
786
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
787 ### printing methylated C percentage in CHH context if applicable
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
788 if ($percent_meCHH){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
789 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
790 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
791 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
792 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
793 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
794 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
795 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
796
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
797 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
799 sub process_single_end_fastA_file_for_methylation_call{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
800 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
801 ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
802 ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
803 ### the C->T or G->A version
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
804
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
805 ### gzipped version of the infile
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
806 if ($sequence_file =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
807 open (IN,"zcat $sequence_file |") or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
808 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
809 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
810 open (IN,$sequence_file) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
811 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
812
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
813 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
814
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
815 warn "\nReading in the sequence file $sequence_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
816 while (1) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
817 # last if ($counting{sequences_count} > 100);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
818 my $identifier = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
819 my $sequence = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
820 last unless ($identifier and $sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
821
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
822 $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
823
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
824 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
825
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
826 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
827 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
828 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
829 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
830 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
831 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
832
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
833 $counting{sequences_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
834 if ($counting{sequences_count}%100000==0) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
835 warn "Processed $counting{sequences_count} sequences so far\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
836 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
837 chomp $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
838 chomp $identifier;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
839
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
840 $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
841
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
842 my $return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
843 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
844 $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
845 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
846 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
847 $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
848 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
849
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
850 unless ($return){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
851 $return = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
852 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
853
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
854 # print the sequence to ambiguous.out if --ambiguous was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
855 if ($ambiguous and $return == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
856 print AMBIG ">$identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
857 print AMBIG "$sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
858 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
859
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
860 # print the sequence to <unmapped.out> file if --un was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
861 elsif ($unmapped and $return == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
862 print UNMAPPED ">$identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
863 print UNMAPPED "$sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
864 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
865 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
866 print "Processed $counting{sequences_count} sequences in total\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
867
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
868 print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
869
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
870 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
871
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
872 sub process_single_end_fastQ_file_for_methylation_call{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
873 my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
874 ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
875 ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
876 ### the C->T or G->A version
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
877
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
878 ### gzipped version of the infile
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
879 if ($sequence_file =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
880 open (IN,"zcat $sequence_file |") or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
881 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
882 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
883 open (IN,$sequence_file) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
884 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
885
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
886 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
887
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
888 warn "\nReading in the sequence file $sequence_file\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
889 while (1) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
890 my $identifier = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
891 my $sequence = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
892 my $identifier_2 = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
893 my $quality_value = <IN>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
894 last unless ($identifier and $sequence and $identifier_2 and $quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
895
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
896 $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
897
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
898 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
899
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
900 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
901 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
902 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
903 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
904 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
905 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
906
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
907 $counting{sequences_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
908
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
909 if ($counting{sequences_count}%1000000==0) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
910 warn "Processed $counting{sequences_count} sequences so far\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
911 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
912 chomp $sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
913 chomp $identifier;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
914 chomp $quality_value;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
915
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
916 $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
917
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
918 my $return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
919 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
920 $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
921 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
922 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
923 $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
924 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
925
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
926 unless ($return){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
927 $return = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
928 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
929
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
930 # print the sequence to ambiguous.out if --ambiguous was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
931 if ($ambiguous and $return == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
932 print AMBIG "\@$identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
933 print AMBIG "$sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
934 print AMBIG $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
935 print AMBIG "$quality_value\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
936 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
937
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
938 # print the sequence to <unmapped.out> file if --un was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
939 elsif ($unmapped and $return == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
940 print UNMAPPED "\@$identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
941 print UNMAPPED "$sequence\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
942 print UNMAPPED $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
943 print UNMAPPED "$quality_value\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
944 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
945 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
946 print "Processed $counting{sequences_count} sequences in total\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
947
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
948 print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
949
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
950 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
951
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
952 sub process_fastA_files_for_paired_end_methylation_calls{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
953 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
954 ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
955 ### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
956 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
957 ### converted genomes (either the C->T or G->A version)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
958
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
959 ### gzipped version of the infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
960 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
961 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
962 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
963 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
964 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
965 open (IN1,$sequence_file_1) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
966 open (IN2,$sequence_file_2) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
967 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
968
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
969 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
970 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
971
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
972 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
973
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
974 while (1) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
975 # reading from the first input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
976 my $identifier_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
977 my $sequence_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
978 # reading from the second input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
979 my $identifier_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
980 my $sequence_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
981 last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
982
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
983 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
984 $identifier_2 = fix_IDs($identifier_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
985
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
986 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
987
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
988 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
989 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
990 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
991 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
992 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
993 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
994
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
995 $counting{sequences_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
996 if ($counting{sequences_count}%100000==0) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
997 warn "Processed $counting{sequences_count} sequences so far\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
998 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
999 my $orig_identifier_1 = $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1000 my $orig_identifier_2 = $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1001
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1002 chomp $sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1003 chomp $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1004 chomp $sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1005 chomp $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1006
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1007 $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1008
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1009 my $return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1010 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1011 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1012 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1013 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1014 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1015 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1016
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1017 unless ($return){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1018 $return = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1019 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1020
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1021 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1022 if ($ambiguous and $return == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1023 print AMBIG_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1024 print AMBIG_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1025 print AMBIG_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1026 print AMBIG_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1027 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1028
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1029 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1030 elsif ($unmapped and $return == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1031 print UNMAPPED_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1032 print UNMAPPED_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1033 print UNMAPPED_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1034 print UNMAPPED_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1035 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1036 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1037
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1038 warn "Processed $counting{sequences_count} sequences in total\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1039
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1040 close OUT or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1041
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1042 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1043
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1044 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1045
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1046 sub process_fastQ_files_for_paired_end_methylation_calls{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1047 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1048 ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1049 ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1050 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1051 ### of the converted genomes (either C->T or G->A version)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1052
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1053 ### gzipped version of the infiles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1054 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1055 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1056 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1057 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1058 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1059 open (IN1,$sequence_file_1) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1060 open (IN2,$sequence_file_2) or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1061 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1062
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1063 my $count = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1064
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1065 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1066 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1067 while (1) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1068 # reading from the first input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1069 my $identifier_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1070 my $sequence_1 = <IN1>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1071 my $ident_1 = <IN1>; # not needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1072 my $quality_value_1 = <IN1>; # not needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1073 # reading from the second input file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1074 my $identifier_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1075 my $sequence_2 = <IN2>;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1076 my $ident_2 = <IN2>; # not needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1077 my $quality_value_2 = <IN2>; # not needed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1078 last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1079
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1080 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1081 $identifier_2 = fix_IDs($identifier_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1082
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1083 ++$count;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1084
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1085 if ($skip){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1086 next unless ($count > $skip);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1087 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1088 if ($upto){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1089 last if ($count > $upto);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1090 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1091
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1092 $counting{sequences_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1093 if ($counting{sequences_count}%100000==0) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1094 warn "Processed $counting{sequences_count} sequences so far\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1095 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1096
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1097 my $orig_identifier_1 = $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1098 my $orig_identifier_2 = $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1099
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1100 chomp $sequence_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1101 chomp $identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1102 chomp $sequence_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1103 chomp $identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1104 chomp $quality_value_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1105 chomp $quality_value_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1106
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1107 $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1108
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1109 my $return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1110 if ($bowtie2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1111 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1112 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1113 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1114 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1115 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1116
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1117 unless ($return){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1118 $return = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1119 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1120
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1121 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1122 if ($ambiguous and $return == 2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1123 # seq_1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1124 print AMBIG_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1125 print AMBIG_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1126 print AMBIG_1 $ident_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1127 print AMBIG_1 "$quality_value_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1128 # seq_2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1129 print AMBIG_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1130 print AMBIG_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1131 print AMBIG_2 $ident_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1132 print AMBIG_2 "$quality_value_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1133 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1134
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1135 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1136 elsif ($unmapped and $return == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1137 # seq_1
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1138 print UNMAPPED_1 $orig_identifier_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1139 print UNMAPPED_1 "$sequence_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1140 print UNMAPPED_1 $ident_1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1141 print UNMAPPED_1 "$quality_value_1\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1142 # seq_2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1143 print UNMAPPED_2 $orig_identifier_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1144 print UNMAPPED_2 "$sequence_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1145 print UNMAPPED_2 $ident_2;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1146 print UNMAPPED_2 "$quality_value_2\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1147 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1148 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1149
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1150 warn "Processed $counting{sequences_count} sequences in total\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1151
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1152 close OUT or die $!;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1153
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1154 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1155
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1156 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1157
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1158 sub check_bowtie_results_single_end{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1159 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1160
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1161 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1162 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1163 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1164
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1165 my %mismatches = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1166 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1167 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1168
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1169 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1170 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1171 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1172 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1173 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1174 ### STEP I Now processing the alignment stored in last_line ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1175 ###############################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1176 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1177 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1178 ### we only continue to extract useful information about this alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1179 if ($valid_alignment_found_1 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1180 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1181 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1182 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1183
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1184 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1185 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1186 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1187
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1188 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1189 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1190 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1191 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1192 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1193 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1194 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1195 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1196 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1197 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1198 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1199 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1200 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1201 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1202 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1203 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1204 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1205 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1206 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1207 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1208 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1209 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1210 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1211 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1212 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1213 ### number for the found alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1214 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1215 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1216 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1217 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1218 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1219 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1220 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1221 $number_of_mismatches = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1222 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1223 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1224 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1225 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1226 ##################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1227 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1228 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1229 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1230 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1231 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1232 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1233 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1234 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1235 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1236 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1237 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1238 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1239 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1240 ### we only continue to extract useful information about this second alignment if 1 was returned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1241 if ($valid_alignment_found_2 == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1242 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1243 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1244 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1245 unless($mismatch_info){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1246 $mismatch_info = '';
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1247 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1248 chomp $mismatch_info;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1249
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1250 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1251 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1252 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1253 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1254 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1255 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1256 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1257
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1258 ### Now extracting the number of mismatches to the converted genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1259 my $number_of_mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1260 if ($mismatch_info eq ''){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1261 $number_of_mismatches = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1262 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1263 elsif ($mismatch_info =~ /^\d/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1264 my @mismatches = split (/,/,$mismatch_info);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1265 $number_of_mismatches = scalar @mismatches;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1266 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1267 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1268 die "Something weird is going on with the mismatch field\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1269 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1270 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1271 ### extracting the chromosome number from the bowtie output (see above)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1272 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1273 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1274 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1275 ### case we are not writing the same entry out a second time.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1276 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1277 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1278 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1279 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1280 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1281 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1282 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1283 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1284 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1285 ####################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1286 $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1287 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1288 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1289 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1290 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1291 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1292 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1293 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1294 else {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1295 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1296 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1297 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1298 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1299 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1300 ### still within the 2nd sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1301 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1302 ### still withing the 1st sequence in correct orientation found
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1303 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1304 ### still within the if (last_seq_id eq identifier) condition
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1305 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1306 ### still within foreach index loop
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1307 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1308 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1309 unless(%mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1310 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1311 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1312 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1313 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1314 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1315 return;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1316 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1317 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1318 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1319 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1320 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1321 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1322 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1323 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1324 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1325 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1326 my $sequence_fails = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1327 ### Declaring an empty hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1328 my $methylation_call_params; # hash reference!
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1329 ### sorting in ascending order
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1330 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1331
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1332 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1333 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1334 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1335 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1336 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1337 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1338 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1339 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1340 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1341 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1342 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1343 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1344 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1345 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1346 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1347 ### reaction. E.g.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1348 ### CAGTCACGCGCGCGCG will become
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1349 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1350 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1351 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1352 ### G->A conversion:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1353 ### highly methylated: CAATCACACACACACA
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1354 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1355 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1356 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1357 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1358 ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1359 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1360 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1361 ### In the above example the number of transliterations required to transform the actual sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1362 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1363 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1364 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1365 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1366 ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1367 ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1368 my @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1369 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1370 my $transliterations_performed;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1371 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1372 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1373 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1374 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1375 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1376 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1377 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1378 die "unexpected index number range $!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1379 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1380 push @three_candidate_seqs,{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1381 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1382 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1383 mismatch_number => $mismatch_number,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1384 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1385 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1386 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1387 transliterations_performed => $transliterations_performed,
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1388 };
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1389 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1390 ### sorting in ascending order for the lowest number of transliterations performed
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1391 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1392 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1393 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1394 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1395 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1396 if (($first_array_element*2) < $second_array_element){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1397 $counting{low_complexity_alignments_overruled_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1398 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1399 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1400 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1401 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1402 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1403 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1404 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1405 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1406 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1407 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1408 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1409 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1410 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1411 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1412 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1413 ### after processing the alignment with the lowest number of mismatches we exit
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1414 last;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1415 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1416 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1417 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1418 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1419 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1420 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1421 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1422 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1423 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1424 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1425 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1426 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1427 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1428 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1429
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1430 ### --DIRECTIONAL
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1431 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1432 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1433 if ($directional){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1434 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1435 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1436 $counting{alignments_rejected_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1437 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1438 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1439 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1440
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1441 ### If the sequence has not been rejected so far it will have a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1442 $counting{unique_best_alignment_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1443 if ($pbat){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1444 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1445 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1446 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1447 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1448 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1449
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1450 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1451 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1452 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1453 $counting{genomic_sequence_could_not_be_extracted_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1454 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1455 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1456
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1457 ### otherwise we are set to perform the actual methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1458 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1459
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1460 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1461 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1462 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1463
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1464 sub check_bowtie_results_single_end_bowtie2{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1465 my ($sequence,$identifier,$quality_value) = @_;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1466
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1467
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1468 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1469 $quality_value = 'I'x(length$sequence);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1470 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1471
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1472 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1473 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1474 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1475
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1476 my $alignment_ambiguous = 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1477
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1478 my %alignments = ();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1479
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1480 ### reading from the Bowtie 2 output filehandles
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1481 foreach my $index (0..$#fhs){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1482 # print "Index: $index\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1483 # print "$fhs[$index]->{last_line}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1484 # print "$fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1485 # sleep (1);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1486 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1487 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1488
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1489 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1490 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1491
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1492 if ($fhs[$index]->{last_seq_id} eq $identifier) {
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1493 # SAM format specifications for Bowtie 2
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1494 # (1) Name of read that aligned
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1495 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1496 # 1 The read is one of a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1497 # 2 The alignment is one end of a proper paired-end alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1498 # 4 The read has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1499 # 8 The read is one of a pair and has no reported alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1500 # 16 The alignment is to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1501 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1502 # 64 The read is mate 1 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1503 # 128 The read is mate 2 in a pair
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1504 # 256 The read has multiple mapping states
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1505 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1506 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1507 # (5) Mapping quality (255 means MAPQ is not available)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1508 # (6) CIGAR string representation of alignment (* if unavailable)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1509 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1510 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1511 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1512 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1513 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1514 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1515 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1516 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1517 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1518 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1519 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1520 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1521 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1522 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1523 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1524 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1525
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1526 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1527
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1528 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1529 if ($flag == 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1530 ## reading in the next alignment, which must be the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1531 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1532 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1533 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1534 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1535 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1536 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1537 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1538 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1539 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1540 next; # next instance
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1541 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1542 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1543 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1544 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1545 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1546 next;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1547 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1548 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1549
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1550 # if there are one or more proper alignments we can extract the chromosome number
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1551 my $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1552 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1553 $chromosome = $mapped_chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1554 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1555 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1556 die "Chromosome number extraction failed for $mapped_chromosome\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1557 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1558
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1559 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1560 my ($alignment_score,$second_best,$MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1561 my @fields = split (/\t/,$fhs[$index]->{last_line});
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1562
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1563 foreach (11..$#fields){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1564 if ($fields[$_] =~ /AS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1565 $alignment_score = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1566 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1567 elsif ($fields[$_] =~ /XS:i:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1568 $second_best = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1569 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1570 elsif ($fields[$_] =~ /MD:Z:(.*)/){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1571 $MD_tag = $1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1572 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1573 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1574
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1575 # warn "First best alignment_score is: '$alignment_score'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1576 # warn "MD tag is: '$MD_tag'\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1577 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1578
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1579 if (defined $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1580 # warn "second best alignment_score is: '$second_best'\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1581
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1582 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1583 if ($alignment_score == $second_best){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1584 $alignment_ambiguous = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1585 ## need to read and discard all additional ambiguous reads until we reach the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1586 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1587 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1588 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1589 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1590 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1591 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1592 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1593 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1594 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1595 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1596 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1597 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1598 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1599 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1600 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1601 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1602 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1603 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1604
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1605 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1606
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1607 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1608 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1609 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1610 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1611
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1612 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1613 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1614 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1615 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1616 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1617 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1618 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1619 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1620 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1621 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1622
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1623 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1624 until ($fhs[$index]->{last_seq_id} ne $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1625 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1626 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1627 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1628 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1629 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1630 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1631 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1632 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1633 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1634 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1635 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1636 last; # break free in case we have reached the end of the alignment output
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1637 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1638 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1639 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1640 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1641 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1642 else{ # there is no second best hit, so we can just store this one and read in the next sequence
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1643
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1644 my $alignment_location = join (":",$chromosome,$position);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1645
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1646 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1647 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1648 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1649 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1650
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1651 unless (exists $alignments{$alignment_location}){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1652 $alignments{$alignment_location}->{seq_id} = $id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1653 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1654 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1655 $alignments{$alignment_location}->{index} = $index;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1656 $alignments{$alignment_location}->{chromosome} = $chromosome;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1657 $alignments{$alignment_location}->{position} = $position;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1658 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1659 $alignments{$alignment_location}->{CIGAR} = $cigar;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1660 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1661
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1662 my $newline = $fhs[$index]->{fh}-> getline();
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1663 if ($newline){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1664 chomp $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1665 my ($seq_id) = split (/\t/,$newline);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1666 $fhs[$index]->{last_seq_id} = $seq_id;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1667 $fhs[$index]->{last_line} = $newline;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1668 if ($seq_id eq $identifier){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1669 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1670 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1671 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1672 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1673 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1674 $fhs[$index]->{last_seq_id} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1675 $fhs[$index]->{last_line} = undef;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1676 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1677 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1678 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1679 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1680
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1681 ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1682 if ($alignment_ambiguous == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1683 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1684 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1685 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1686 # print "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1687
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1688 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1689 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1690 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1691 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1692 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1693 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1694 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1695 return 0;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1696 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1697 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1698
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1699 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1700 unless(%alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1701 $counting{no_single_alignment_found}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1702 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1703 # print "$unmapped_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1704 if ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1705 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1706 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1707 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1708 return 0; # default
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1709 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1710 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1711
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1712 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1713
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1714 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1715 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1716 ### alignment score we are discarding the sequence altogether.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1717 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1718 ### opening (5) and extending (3 per bp) the gap.
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1719
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1720 #######################################################################################################################################################
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1721
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1722 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1723 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1724
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1725 ### print contents of %alignments for debugging
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1726 # if (scalar keys %alignments > 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1727 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1728 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1729 # print "Loc: $alignment_location\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1730 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1731 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1732 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1733 # print "Index $alignments{$alignment_location}->{index}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1734 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1735 # print "pos: $alignments{$alignment_location}->{position}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1736 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1737 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1738 # print "\n******\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1739 # }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1740
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1741 ### if there is only 1 entry in the hash with we accept it as the best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1742 if (scalar keys %alignments == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1743 for my $unique_best_alignment (keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1744 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1745 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1746 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1747 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1748 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1749 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1750 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1751 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1752 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1753
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1754 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1755 ### we boot the sequence altogether
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1756 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1757 my $best_alignment_score;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1758 my $best_alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1759 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1760 # print "$alignments{$alignment_location}->{alignment_score}\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1761 unless (defined $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1762 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1763 $best_alignment_location = $alignment_location;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1764 # print "setting best alignment score: $best_alignment_score\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1765 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1766 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1767 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1768 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1769 # warn "Same alignment score, the sequence will get booted!\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1770 $sequence_fails = 1;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1771 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1772 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1773 ### else we are going to store the best alignment for further processing
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1774 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1775 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1776 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1777 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1778 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1779 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1780 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1781 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1782 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1783 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1784 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1785 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1786 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1787 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1788 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1789 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1790
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1791 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1792 if ($sequence_fails == 1){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1793 $counting{unsuitable_sequence_count}++;
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1794
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1795 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1796 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1797 # print OUT "$ambiguous_read_output\n";
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1798
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1799 if ($ambiguous){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1800 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1801 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1802 elsif ($unmapped){
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1803 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1804 }
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1805 else{
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1806 return 0; # => exits to next sequence (default)
62c6da72dd4a Uploaded
bgruening
parents:
diff changeset
1807 }