0
|
1 #!/usr/bin/perl --
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use IO::Handle;
|
|
5 use Cwd;
|
|
6 $|++;
|
|
7 use Getopt::Long;
|
|
8
|
|
9
|
|
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
|
|
11
|
|
12 ## This program is free software: you can redistribute it and/or modify
|
|
13 ## it under the terms of the GNU General Public License as published by
|
|
14 ## the Free Software Foundation, either version 3 of the License, or
|
|
15 ## (at your option) any later version.
|
|
16
|
|
17 ## This program is distributed in the hope that it will be useful,
|
|
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 ## GNU General Public License for more details.
|
|
21
|
|
22 ## You should have received a copy of the GNU General Public License
|
|
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
|
25
|
|
### Directory the program was started from; the main loop changes back to it before each input file
my $parent_dir      = getcwd;
my $bismark_version = 'v0.10.0';
### Copy of the options as supplied, taken before any option name is rewritten below
my $command_line    = join (" ",@ARGV);

### The '.' in --solexa1.3-quals makes Getopt::Long fail, so the option is rewritten to
### --phred64-quals in @ARGV before the command line gets processed
for my $option (@ARGV){
  $option = '--phred64-quals' if ($option eq '--solexa1.3-quals');
}

my @filenames; # will be populated by processing the command line

my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
    $prefix,
    $old_flag,
   ) = process_command_line();

### File-wide state shared by the subroutines below
my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome; filled once and kept in memory for all following input files
my %counting;    # counting various events

my $seqID_contains_tabs; # reset to 0 for every input file by the main loop
|
|
45
|
|
### Main loop: every entry of @filenames is either one file (single-end) or two files separated by
### a comma (paired-end). For each entry the converted read files are produced by the biTransform*
### routines, registered in @fhs, handed to the matching alignment routine and finally passed on to
### the methylation call procedure.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ ','){
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### additional variables only for paired-end alignments
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    ### Registers the converted input files for the four @fhs entries. Entries 0 and 3 read the
    ### C->T version of read 1 together with the G->A version of read 2, entries 1 and 2 read the
    ### G->A version of read 1 together with the C->T version of read 2. A file that was not
    ### generated is passed as undef, which stores undef for that entry.
    my $set_inputfiles = sub {
      my ($ct_read_1,$ga_read_1,$ct_read_2,$ga_read_2) = @_;
      foreach my $index (0,3){
        $fhs[$index]->{inputfile_1} = $ct_read_1;
        $fhs[$index]->{inputfile_2} = $ga_read_2;
      }
      foreach my $index (1,2){
        $fhs[$index]->{inputfile_1} = $ga_read_1;
        $fhs[$index]->{inputfile_2} = $ct_read_2;
      }
    };

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){
        ### only the first file name returned for each mate is kept in directional mode
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      $set_inputfiles->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      if ($directional){
        if (!$bowtie2 and $gzip){ # Bowtie 1 alignments with compressed temp files
          ### the returned file contains both read 1 and read 2 in tab delimited format, a separate
          ### file for read 2 is therefore no longer needed and stays undef
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }
        $set_inputfiles->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      elsif($pbat){ # PBAT-Seq
        ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        ### all four files exist, but only entries 1 and 2 get input files in PBAT mode
        $set_inputfiles->(undef,$G_to_A_infile_1,$C_to_T_infile_2,undef);
      }
      else{
        if (!$bowtie2 and $gzip){ # Bowtie 1 alignments with compressed temp files
          ### the files for read 2 are not needed for compressed temp files and stay undef
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }
        $set_inputfiles->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }
    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input file is in FastA format\n";
      if ($directional){
        ### @fhs entries 2 and 3 are not given an input file in directional mode
        ($C_to_T_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
      }
    }

    ## FastQ format
    else{
      warn "Input file is in FastQ format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      elsif($pbat){
        ($G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating up to 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      elsif ($pbat){
        single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
      }
    }

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);

  }
}
|
|
282
|
|
### Prepares all output for one single-end input file and starts the methylation call:
###  - opens the alignment output (OUT) as BAM via samtools, gzipped or plain text depending on $bam
###  - opens the report file (REPORT) and, if requested, the unmapped (UNMAPPED) and ambiguous (AMBIG) read files
###  - reads the genome into memory unless %chromosomes is already populated
###  - writes the SAM header unless $vanilla or $sam_no_hd is set
###  - passes control to the FastA or FastQ single-end methylation call routine
### Arguments:
###   $sequence_file - the input file as supplied (a leading path is stripped for naming the output files)
###   $C_to_T_infile - name of the C->T converted temporary file (may be undef)
###   $G_to_A_infile - name of the G->A converted temporary file (may be undef)
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### output files are named after the input file without its directory part
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### if a prefix was specified it is put in front of every output file name, joined by a '.'
  my $basename = $prefix ? "$prefix.$filename" : $filename;

  ### printing all alignments to a results file
  my $outfile = $basename;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile .= '_bismark_bt2.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark.txt';
  }
  else{ # SAM is the default output
    $outfile .= '_bismark.sam';
  }

  $bam = 0 unless (defined $bam);

  ### NOTE(review): the two piped opens below interpolate $samtools_path, $output_dir and the file
  ### name into a shell command without quoting; names containing spaces or shell metacharacters
  ### will break the command
  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### only the trailing 'sam' extension must be changed. An unanchored s/sam/bam/ would alter
    ### the first 'sam' anywhere in the name (e.g. 'sample.fq_bismark.sam' -> 'bample.fq_bismark.sam')
    $outfile =~ s/sam$/bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $basename;
  $reportfile .= $bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt';

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = $basename.'_unmapped_reads.txt';
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = $basename.'_ambiguous_reads.txt';
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
|
|
394
|
|
### Prepares all output for one pair of paired-end input files and starts the methylation call:
###  - opens the alignment output (OUT) as BAM via samtools, gzipped or plain text depending on $bam
###  - opens the report file (REPORT) and, if requested, the unmapped (UNMAPPED_1/2) and ambiguous (AMBIG_1/2) read files
###  - reads the genome into memory unless %chromosomes is already populated
###  - writes the SAM header unless $vanilla or $sam_no_hd is set
###  - passes control to the FastA or FastQ paired-end methylation call routine
### The alignment output and the report are named after read file 1.
### Arguments:
###   $sequence_file_1, $sequence_file_2 - the input files as supplied (leading paths are stripped for naming the output files)
###   $C_to_T_infile_1, $G_to_A_infile_1 - names of the converted temporary files of read 1 (may be undef)
###   $C_to_T_infile_2, $G_to_A_infile_2 - names of the converted temporary files of read 2 (may be undef)
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### output files are named after the input files without their directory part
  my $filename_1 = $sequence_file_1;
  if ($sequence_file_1 =~ /\//){
    ($filename_1) = $sequence_file_1 =~ m/.*\/(.*)$/;
  }

  my $filename_2 = $sequence_file_2;
  if ($sequence_file_2 =~ /\//){
    ($filename_2) = $sequence_file_2 =~ m/.*\/(.*)$/;
  }

  ### if a prefix was specified it is put in front of every output file name, joined by a '.'
  my $basename_1 = $prefix ? "$prefix.$filename_1" : $filename_1;
  my $basename_2 = $prefix ? "$prefix.$filename_2" : $filename_2;

  ### printing all alignments to a results file
  my $outfile = $basename_1;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile .= '_bismark_bt2_pe.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark_pe.txt';
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile .= '_bismark_pe.sam';
  }

  $bam = 0 unless (defined $bam);

  ### NOTE(review): the two piped opens below interpolate $samtools_path, $output_dir and the file
  ### name into a shell command without quoting; names containing spaces or shell metacharacters
  ### will break the command
  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### only the trailing 'sam' extension must be changed. An unanchored s/sam/bam/ would alter
    ### the first 'sam' anywhere in the name (e.g. 'sample_1.fq_bismark_pe.sam' -> 'bample_1.fq_bismark_pe.sam')
    $outfile =~ s/sam$/bam/;
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $basename_1;
  $reportfile .= $bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt';

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $basename_1.'_unmapped_reads_1.txt';
    my $unmapped_2 = $basename_2.'_unmapped_reads_2.txt';
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = $basename_1.'_ambiguous_reads_1.txt';
    my $amb_2 = $basename_2.'_ambiguous_reads_2.txt';
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
|
|
526
|
|
527 sub print_final_analysis_report_single_end{
|
|
528 my ($C_to_T_infile,$G_to_A_infile) = @_;
|
|
529 ### All sequences from the original sequence file have been analysed now
|
|
530 ### deleting temporary C->T or G->A infiles
|
|
531
|
|
532 if ($directional){
|
|
533 my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
|
|
534 if ($deletion_successful == 1){
|
|
535 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
|
|
536 }
|
|
537 else{
|
|
538 warn "Could not delete temporary file $C_to_T_infile properly $!\n";
|
|
539 }
|
|
540 }
|
|
541 elsif ($pbat){
|
|
542 my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
|
|
543 if ($deletion_successful == 1){
|
|
544 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
|
|
545 }
|
|
546 else{
|
|
547 warn "Could not delete temporary file $G_to_A_infile properly $!\n";
|
|
548 }
|
|
549 }
|
|
550 else{
|
|
551 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
|
|
552 if ($deletion_successful == 2){
|
|
553 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
|
|
554 }
|
|
555 else{
|
|
556 warn "Could not delete temporary files properly $!\n";
|
|
557 }
|
|
558 }
|
|
559
|
|
560 ### printing a final report for the alignment procedure
|
|
561 print REPORT "Final Alignment report\n",'='x22,"\n";
|
|
562 warn "Final Alignment report\n",'='x22,"\n";
|
|
563 # foreach my $index (0..$#fhs){
|
|
564 # print "$fhs[$index]->{name}\n";
|
|
565 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
|
|
566 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
|
|
567 # }
|
|
568
|
|
569 ### printing a final report for the methylation call procedure
|
|
570 warn "Sequences analysed in total:\t$counting{sequences_count}\n";
|
|
571 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
|
|
572 my $percent_alignable_sequences;
|
|
573
|
|
574 if ($counting{sequences_count} == 0){
|
|
575 $percent_alignable_sequences = 0;
|
|
576 }
|
|
577 else{
|
|
578 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
|
|
579 }
|
|
580
|
|
581 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
|
|
582 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
|
|
583
|
|
584 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
|
|
585 ### only calculating the percentage if there were any overruled alignments
|
|
586 if ($counting{low_complexity_alignments_overruled_count}){
|
|
587 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
|
|
588 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
|
|
589 }
|
|
590
|
|
591 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
|
|
592 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
|
|
593 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
|
|
594 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
|
|
595 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
|
|
596
|
|
597 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
|
|
598 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
|
|
599 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
|
|
600 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
|
|
601 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
|
|
602
|
|
603 if ($directional){
|
|
604 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
|
|
605 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
|
|
606 }
|
|
607
|
|
608 ### detailed information about Cs analysed
|
|
609 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
|
|
611 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
|
|
612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
|
613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
3
|
614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
|
|
615 if ($bowtie2){
|
|
616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
|
|
617 }
|
|
618 warn "\n";
|
|
619
|
|
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
|
|
623 if ($bowtie2){
|
|
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
|
|
625 }
|
|
626 warn "\n";
|
0
|
627
|
|
628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
|
|
629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
|
3
|
630
|
|
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
|
0
|
632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
|
3
|
633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
|
|
634 if ($bowtie2){
|
|
635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
|
|
636 }
|
|
637 print REPORT "\n";
|
|
638
|
|
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
|
|
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
|
|
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
|
|
642 if ($bowtie2){
|
|
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
|
|
644 }
|
|
645 print REPORT "\n";
|
0
|
646
|
|
647 my $percent_meCHG;
|
|
648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
|
|
649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
|
|
650 }
|
|
651
|
|
652 my $percent_meCHH;
|
|
653 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
|
|
654 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
|
|
655 }
|
|
656
|
|
657 my $percent_meCpG;
|
|
658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
|
|
659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
|
|
660 }
|
|
661
|
3
|
662 my $percent_meC_unknown;
|
|
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
|
|
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
|
|
665 }
|
|
666
|
|
667
|
0
|
668 ### printing methylated CpG percentage if applicable
|
|
669 if ($percent_meCpG){
|
|
670 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
|
|
672 }
|
|
673 else{
|
|
674 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
675 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
|
|
676 }
|
|
677
|
|
678 ### printing methylated C percentage (CHG context) if applicable
|
|
679 if ($percent_meCHG){
|
|
680 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
681 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
|
|
682 }
|
|
683 else{
|
|
684 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
|
|
686 }
|
|
687
|
|
688 ### printing methylated C percentage (CHH context) if applicable
|
|
689 if ($percent_meCHH){
|
3
|
690 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
|
|
691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
|
0
|
692 }
|
|
693 else{
|
3
|
694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
|
|
695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
|
|
696 }
|
|
697
|
|
698 ### printing methylated C percentage (Unknown C context) if applicable
|
|
699 if ($bowtie2){
|
|
700 if ($percent_meC_unknown){
|
|
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
|
|
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
|
|
703 }
|
|
704 else{
|
|
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
|
|
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
|
|
707 }
|
|
708 }
|
|
709 print REPORT "\n\n";
|
|
710 warn "\n\n";
|
0
|
711
|
|
712 if ($seqID_contains_tabs){
|
|
713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
|
|
714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
|
|
715 }
|
3
|
716
|
|
717
|
|
718 ###########################################################################################################################################
|
|
719 ### create pie-chart with mapping stats
|
|
720 ###########################################################################################################################################
|
|
721
|
|
722
|
|
723 my $filename;
|
|
724 if ($pbat){
|
|
725 $filename = $G_to_A_infile;
|
|
726 }
|
|
727 else{
|
|
728 $filename = $C_to_T_infile;
|
|
729 }
|
|
730
|
|
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
|
|
732 $pie_chart =~ s/gz$//;
|
|
733 $pie_chart =~ s/_C_to_T\.fastq$//;
|
|
734 $pie_chart =~ s/_G_to_A\.fastq$//;
|
|
735
|
|
736 # if ($prefix){
|
|
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
|
|
738 # }
|
|
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
|
|
740
|
|
741
|
|
742 #Check whether the module GD::Graph is installed
|
|
743 my $gd_graph_installed = 0;
|
|
744 eval{
|
|
745 require GD::Graph::pie;
|
|
746 GD::Graph::pie->import();
|
|
747 };
|
|
748
|
|
749 unless($@) {
|
|
750 $gd_graph_installed = 1;
|
|
751 }
|
|
752 else{
|
|
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
|
|
754 sleep(2);
|
|
755 }
|
|
756
|
|
757 if ($gd_graph_installed){
|
|
758 warn "Generating pie chart\n\n";
|
|
759 sleep(1);
|
|
760 my $graph = GD::Graph::pie->new(600,600);
|
|
761
|
|
762 my $percent_unaligned;
|
|
763 my $percent_multiple;
|
|
764 my $percent_unextractable;
|
|
765
|
|
766 if ($counting{sequences_count}){
|
|
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
|
|
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
|
|
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
|
|
770 }
|
|
771 else{
|
|
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
|
|
773 }
|
|
774
|
|
775 my @aln_stats = (
|
|
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
|
|
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
|
|
778 );
|
|
779
|
|
780 $graph->set(
|
|
781 start_angle => 180,
|
|
782 '3d' => 0,
|
|
783 label => 'Alignment stats (single-end)',
|
|
784 suppress_angle => 2, # Only label slices of sufficient size
|
|
785 transparent => 0,
|
|
786 dclrs => [ qw(red lorange dgreen cyan) ],
|
|
787 ) or die $graph->error;
|
|
788
|
|
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
|
|
790
|
|
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
|
|
792 binmode PIE;
|
|
793 print PIE $gd->png;
|
|
794 }
|
|
795
|
|
796 warn "====================\nBismark run complete\n====================\n\n";
|
|
797
|
0
|
798 }
|
|
799
|
3
|
800
|
0
|
### Prints the final alignment report and the final cytosine methylation report of a paired-end run to
### STDOUT/STDERR and to the REPORT filehandle, deletes the temporary converted read files and, if the
### module GD::Graph::pie can be loaded, writes a pie chart of the mapping statistics into $output_dir.
###
### Arguments: the names of the temporary C->T and G->A converted infiles of read 1 and read 2
### ($C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2); which of them are set
### decides how many temporary files get deleted.
### Relies on the file-level variables $directional, $temp_dir, $bowtie2, $pbat, $output_dir and %counting.
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  if ($directional){
    if ($G_to_A_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
      if ($deletion_successful == 1){
        warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
      }
    }
  }
  else{
    if ($G_to_A_infile_2 and $C_to_T_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 4){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
  }

  ### printing a final report for the alignment procedure
  warn "Final Alignment report\n",'='x22,"\n";
  print REPORT "Final Alignment report\n",'='x22,"\n";
  #  foreach my $index (0..$#fhs){
  #    print "$fhs[$index]->{name}\n";
  #    print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #    print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #  }

  ### printing a final report for the methylation call procedure
  warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";

  # the mapping efficiency is reported as 0 if no sequence pair was analysed at all (avoids a division by zero)
  my $percent_alignable_sequence_pairs;
  if ($counting{sequences_count} == 0){
    $percent_alignable_sequence_pairs = 0;
  }
  else{
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";

  print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";

  ### detailed information about Cs analysed
  if ($directional){
    print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }

  warn "Final Cytosine Methylation Report\n",'='x33,"\n";
  print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";

  # the total only sums up the CpG, CHG and CHH counts; Cs in Unknown context are listed separately below
  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
  warn "Total number of C's analysed:\t$total_number_of_C\n\n";
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
  }
  warn "\n";

  warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
  }
  warn "\n";

  print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
  print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
  }
  print REPORT "\n";

  print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
  }
  print REPORT "\n";

  # returns the methylation percentage with one decimal place, or undef if no C was seen in that context at all
  my $percent_methylated = sub {
    my ($methylated,$unmethylated) = @_;
    return undef unless (($methylated+$unmethylated) > 0);
    return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
  };

  my $percent_meCHG       = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
  my $percent_meCHH       = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});
  my $percent_meCpG       = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
  my $percent_meC_unknown = $percent_methylated->($counting{total_meC_unknown_count},$counting{total_unmethylated_C_unknown_count});

  ### printing methylated CpG percentage if applicable
  # (a computed percentage is always a string such as '0.0', so testing for definedness is equivalent to the truth test)
  if (defined $percent_meCpG){
    warn "C methylated in CpG context:\t${percent_meCpG}%\n";
    print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
  }

  ### printing methylated C percentage in CHG context if applicable
  if (defined $percent_meCHG){
    warn "C methylated in CHG context:\t${percent_meCHG}%\n";
    print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
  }

  ### printing methylated C percentage in CHH context if applicable
  if (defined $percent_meCHH){
    warn "C methylated in CHH context:\t${percent_meCHH}%\n";
    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
  }

  ### printing methylated C percentage (Unknown C context) if applicable
  if ($bowtie2){
    if (defined $percent_meC_unknown){
      warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
      print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
    }
    else{
      warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
      print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
    }
  }
  print REPORT "\n\n";
  warn "\n\n";

  ############################################################################################################################################
  ### create pie-chart with mapping stats
  ###########################################################################################################################################

  # the name of the pie chart is derived from the first temporary infile (the G->A file in --pbat mode)
  my $filename = $pbat ? $G_to_A_infile_1 : $C_to_T_infile_1;

  my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $pie_chart =~ s/gz$//;
  $pie_chart =~ s/_C_to_T\.fastq$//; # the dot is escaped so that only a literal '.fastq' ending is removed
  $pie_chart =~ s/_G_to_A\.fastq$//;
  ### special format for gzipped PE Bowtie1 files
  $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
  $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;

  # a --prefix does not need to be prepended here, it is already part of the temp file names
  $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";

  # Check whether the module GD::Graph is installed
  my $gd_graph_installed = 0;
  eval{
    require GD::Graph::pie;
    GD::Graph::pie->import();
  };

  if ($@){
    warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
    sleep(2);
  }
  else{
    $gd_graph_installed = 1;
  }

  if ($gd_graph_installed){
    warn "Generating pie chart\n\n";
    sleep(1);
    my $graph = GD::Graph::pie->new(600,600);

    my $percent_unaligned;
    my $percent_multiple;
    my $percent_unextractable;

    if ($counting{sequences_count}){
      $percent_unaligned     = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
      $percent_multiple      = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
      $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
    }
    else{
      $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
    }

    # first row: slice labels, second row: the counts the slices are drawn from
    my @aln_stats = (
                     ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
                     [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
                    );

    $graph->set(
                start_angle    => 180,
                '3d'           => 0,
                label          => 'Alignment stats (paired-end)',
                suppress_angle => 2, # Only label slices of sufficient size
                transparent    => 0,
                dclrs          => [ qw(red lorange dgreen cyan) ],
               ) or die $graph->error;

    my $gd = $graph->plot(\@aln_stats) or die $graph->error;

    open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
    binmode PIE;
    print PIE $gd->png;
    # buffered write errors only surface when the handle is closed; the run itself is complete at this point, so only warn
    close PIE or warn "Failed to close file for alignments pie chart $pie_chart: $!\n\n";
  }

  warn "====================\nBismark run complete\n====================\n\n";

}
|
|
1076
|
|
### Reads a single-end FastA file record by record (header line followed by one sequence line), hands every
### sequence to the Bowtie 1 or Bowtie 2 result checker and finally prints the single-end analysis report.
###
### Arguments: $sequence_file   - the FastA read file; a name ending in .gz is decompressed through zcat
###            $C_to_T_infile,
###            $G_to_A_infile   - names of the temporary converted infiles, handed on to the final report routine
### Relies on the file-level variables $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting and on the
### filehandles AMBIG and UNMAPPED.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the file name is handed to zcat directly and never gets interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # 3-argument open: special characters in the file name can not be mistaken for an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    last unless ($identifier and $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # --skip: ignore the first $skip records; --upto: stop once more than $upto records have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  # The result of close is deliberately not checked: when reading stopped early (--upto) a zcat child is
  # expected to terminate abnormally, which makes close on the pipe return false
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
|
|
1149
|
|
### Reads a single-end FastQ file record by record (4 lines per read), hands every sequence together with its
### quality string to the Bowtie 1 or Bowtie 2 result checker and finally prints the single-end analysis report.
###
### Arguments: $sequence_file   - the FastQ read file; a name ending in .gz is decompressed through zcat
###            $C_to_T_infile,
###            $G_to_A_infile   - names of the temporary converted infiles, handed on to the final report routine
### Relies on the file-level variables $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting and on the
### filehandles AMBIG and UNMAPPED.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    # list-form pipe open: the file name is handed to zcat directly and never gets interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    # 3-argument open: special characters in the file name can not be mistaken for an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    last unless ($identifier and $sequence and $identifier_2 and $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    # --skip: ignore the first $skip records; --upto: stop once more than $upto records have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    # ($identifier_2 was not chomped and therefore still carries its own line ending)
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  # The result of close is deliberately not checked: when reading stopped early (--upto) a zcat child is
  # expected to terminate abnormally, which makes close on the pipe return false
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
|
|
1229
|
|
### Reads the two FastA files of a paired-end run in lockstep (header line followed by one sequence line each),
### hands every sequence pair to the Bowtie 1 or Bowtie 2 paired-end result checker and finally prints the
### paired-end analysis report.
###
### Arguments: $sequence_file_1, $sequence_file_2 - the FastA files of read 1 and read 2; a name ending in .gz
###                                                 is decompressed through zcat
###            $C_to_T_infile_1, $G_to_A_infile_1,
###            $C_to_T_infile_2, $G_to_A_infile_2 - names of the temporary converted infiles, handed on to the
###                                                 final report routine
### Relies on the file-level variables $skip, $upto, $bowtie2, $ambiguous, $unmapped and %counting and on the
### filehandles AMBIG_1/2, UNMAPPED_1/2 and OUT.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)

  ### Each infile is checked for gzip compression on its own so that a pair consisting of one gzipped and one
  ### plain file is read correctly. The list-form pipe open hands the file name to zcat without involving a shell,
  ### and the 3-argument open prevents special characters in a file name from being taken as an open mode.
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    # --skip: ignore the first $skip records; --upto: stop once more than $upto records have been read
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    # copies of the header lines as returned by fix_IDs (taken before chomp), used for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    # only the ID of read 1 is handed to the result checker
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }

    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  # The result of these closes is deliberately not checked: when reading stopped early (--upto) a zcat child is
  # expected to terminate abnormally, which makes close on the pipe return false
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  # OUT is not opened in this subroutine; it is closed here before the final report gets printed
  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
|
|
1323
|
|
1324 sub process_fastQ_files_for_paired_end_methylation_calls{
|
|
1325 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
|
|
1326 ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
|
|
1327 ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
|
|
1328 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
|
|
1329 ### of the converted genomes (either C->T or G->A version)
|
|
1330
|
|
1331 ### gzipped version of the infiles
|
|
1332 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
|
|
1333 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
|
|
1334 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
|
|
1335 }
|
|
1336 else{
|
|
1337 open (IN1,$sequence_file_1) or die $!;
|
|
1338 open (IN2,$sequence_file_2) or die $!;
|
|
1339 }
|
|
1340
|
|
1341 my $count = 0;
|
|
1342
|
|
1343 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
|
|
1344 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
|
|
1345 while (1) {
|
|
1346 # reading from the first input file
|
|
1347 my $identifier_1 = <IN1>;
|
|
1348 my $sequence_1 = <IN1>;
|
|
1349 my $ident_1 = <IN1>; # not needed
|
|
1350 my $quality_value_1 = <IN1>; # not needed
|
|
1351 # reading from the second input file
|
|
1352 my $identifier_2 = <IN2>;
|
|
1353 my $sequence_2 = <IN2>;
|
|
1354 my $ident_2 = <IN2>; # not needed
|
|
1355 my $quality_value_2 = <IN2>; # not needed
|
|
1356 last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
|
|
1357
|
|
1358 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
|
|
1359 $identifier_2 = fix_IDs($identifier_2);
|
|
1360
|
|
1361 ++$count;
|
|
1362
|
|
1363 if ($skip){
|
|
1364 next unless ($count > $skip);
|
|
1365 }
|
|
1366 if ($upto){
|
|
1367 last if ($count > $upto);
|
|
1368 }
|
|
1369
|
|
1370 $counting{sequences_count}++;
|
3
|
1371 if ($counting{sequences_count}%1000000==0) {
|
|
1372 warn "Processed $counting{sequences_count} sequence pairs so far\n";
|
0
|
1373 }
|
|
1374
|
|
1375 my $orig_identifier_1 = $identifier_1;
|
|
1376 my $orig_identifier_2 = $identifier_2;
|
|
1377
|
|
1378 chomp $sequence_1;
|
|
1379 chomp $identifier_1;
|
|
1380 chomp $sequence_2;
|
|
1381 chomp $identifier_2;
|
|
1382 chomp $quality_value_1;
|
|
1383 chomp $quality_value_2;
|
|
1384
|
|
1385 $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
|
|
1386
|
|
1387 my $return;
|
|
1388 if ($bowtie2){
|
|
1389 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
|
|
1390 }
|
|
1391 else{
|
|
1392 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
|
|
1393 }
|
|
1394
|
|
1395 unless ($return){
|
|
1396 $return = 0;
|
|
1397 }
|
|
1398
|
|
1399 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
|
|
1400 if ($ambiguous and $return == 2){
|
|
1401 # seq_1
|
|
1402 print AMBIG_1 $orig_identifier_1;
|
|
1403 print AMBIG_1 "$sequence_1\n";
|
|
1404 print AMBIG_1 $ident_1;
|
|
1405 print AMBIG_1 "$quality_value_1\n";
|
|
1406 # seq_2
|
|
1407 print AMBIG_2 $orig_identifier_2;
|
|
1408 print AMBIG_2 "$sequence_2\n";
|
|
1409 print AMBIG_2 $ident_2;
|
|
1410 print AMBIG_2 "$quality_value_2\n";
|
|
1411 }
|
|
1412
|
|
1413 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
|
|
1414 elsif ($unmapped and $return == 1){
|
|
1415 # seq_1
|
|
1416 print UNMAPPED_1 $orig_identifier_1;
|
|
1417 print UNMAPPED_1 "$sequence_1\n";
|
|
1418 print UNMAPPED_1 $ident_1;
|
|
1419 print UNMAPPED_1 "$quality_value_1\n";
|
|
1420 # seq_2
|
|
1421 print UNMAPPED_2 $orig_identifier_2;
|
|
1422 print UNMAPPED_2 "$sequence_2\n";
|
|
1423 print UNMAPPED_2 $ident_2;
|
|
1424 print UNMAPPED_2 "$quality_value_2\n";
|
|
1425 }
|
|
1426 }
|
|
1427
|
|
1428 warn "Processed $counting{sequences_count} sequences in total\n\n";
|
|
1429
|
|
1430 close OUT or die $!;
|
|
1431
|
|
1432 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
|
|
1433
|
|
1434 }
|
|
1435
|
|
### Checks whether a single-end read produced a unique best Bowtie 1 alignment in any of the alignment
### instances stored in @fhs and, if so, performs the methylation call and prints the mapping result.
###
### Arguments:
###   $sequence      - the read sequence
###   $identifier    - the read ID, compared with 'eq' against the first field of the Bowtie output lines
###   $quality_value - the quality string of the read; if false, a string of 'I' characters of the
###                    same length as the sequence is used instead (FastA input)
###
### Return values:
###   1       - no alignment found, or not uniquely alignable, and --un was specified (print as unmapped)
###   2       - not uniquely alignable and --ambiguous was specified (print as ambiguous)
###   0       - every other handled case (result printed, alignment rejected, genomic sequence not extractable)
###   (empty) - no alignment found and --un was not specified
###
### Side effects: advances the Bowtie filehandles in @fhs and updates the %counting statistics.
sub check_bowtie_results_single_end{
  my ($sequence,$identifier,$quality_value) = @_;

  unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
    $quality_value = 'I'x(length$sequence);
  }

  ### alignments found for this read: $mismatches{number of mismatches}->{chromosome:position}->{alignment details}
  my %mismatches = ();

  ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});

    ### we are only interested in this instance if the sequence we are currently looking at produced an alignment
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ###############################################################
    ### STEP I Now processing the alignment stored in last_line ###
    ###############################################################
    ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs,
    ### both in the wrong orientation. We only continue to extract useful information about this alignment if 1 was returned
    my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
    next unless ($valid_alignment_found_1 == 1);

    ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
    _store_single_end_bowtie1_alignment($index,\%mismatches);

    ##################################################################################################################################################
    ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or
    ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
    ### be returned as $valid_alignment_found and it will then be processed in the next round only.
    ##################################################################################################################################################
    next unless (_read_next_single_end_bowtie1_line($index)); # end of bowtie output for this instance

    ### we only continue to extract useful information about this second alignment if 1 was returned
    my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
    next unless ($valid_alignment_found_2 == 1);

    ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
    ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
    ### case the same entry is not stored a second time.
    _store_single_end_bowtie1_alignment($index,\%mismatches);

    ####################################################################################################################################
    #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line}    ###
    ####################################################################################################################################
    if (_read_next_single_end_bowtie1_line($index)){
      die "The same seq ID occurred more than twice in a row\n" if ($fhs[$index]->{last_seq_id} eq $identifier);
    }
  }

  ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless (%mismatches){
    $counting{no_single_alignment_found}++;
    if ($unmapped){
      return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
    }
    else{
      return;
    }
  }

  #######################################################################################################################################################
  ### We are now looking if there is a unique best alignment for a certain sequence. This means we are only looking at the alignment(s) with the      ###
  ### lowest number of mismatches. If there is only one single best position we are going to store the alignment information in the meth_call         ###
  ### variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether                     ###
  #######################################################################################################################################################

  ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
  my $sequence_fails = 0;

  ### Hash reference which will store all information we need for the methylation call
  my $methylation_call_params;

  ### only the entries with the lowest number of mismatches are considered (%mismatches is guaranteed to be non-empty here)
  my ($mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my $best_alignments   = $mismatches{$mismatch_number};
  my @best_locations    = keys %{$best_alignments};

  ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
  if (scalar @best_locations == 1){
    my $unique_best_alignment = $best_alignments->{$best_locations[0]};
    $methylation_call_params->{$identifier}->{bowtie_sequence}      = $unique_best_alignment->{bowtie_sequence};
    $methylation_call_params->{$identifier}->{chromosome}           = $unique_best_alignment->{chromosome};
    $methylation_call_params->{$identifier}->{position}             = $unique_best_alignment->{position};
    $methylation_call_params->{$identifier}->{index}                = $unique_best_alignment->{index};
    $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
  }
  elsif (scalar @best_locations == 3){
    ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
    ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
    ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
    ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
    ### reaction. E.g.
    ### CAGTCACGCGCGCGCG will become
    ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
    ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
    ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
    ### G->A conversion:
    ### highly methylated: CAATCACACACACACA
    ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
    ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
    ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
    ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
    ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
    ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
    ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
    ### In the above example the number of transliterations required to transform the actual sequence
    ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
    ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
    ### If the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
    ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
    ### transliterations. As a threshold which does scale, the number of transliterations of the lowest best match x 2 must still be smaller than the
    ### number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
    my @three_candidate_seqs;
    foreach my $composite_location (@best_locations){
      my $candidate = $best_alignments->{$composite_location};

      ### indices 0 and 1 are scored with the 'CT' conversion, indices 2 and 3 with the 'GA' conversion
      my $transliterations_performed;
      if ($candidate->{index} == 0 or $candidate->{index} == 1){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
      }
      elsif ($candidate->{index} == 2 or $candidate->{index} == 3){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
      }
      else{
        die "unexpected index number range: $candidate->{index}\n";
      }

      push @three_candidate_seqs,{
        index                      => $candidate->{index},
        bowtie_sequence            => $candidate->{bowtie_sequence},
        mismatch_number            => $mismatch_number,
        chromosome                 => $candidate->{chromosome},
        position                   => $candidate->{position},
        seq_id                     => $candidate->{seq_id},
        transliterations_performed => $transliterations_performed,
      };
    }

    ### sorting in ascending order for the lowest number of transliterations performed
    @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
    my $first_array_element  = $three_candidate_seqs[0]->{transliterations_performed};
    my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};

    if (($first_array_element*2) < $second_array_element){
      $counting{low_complexity_alignments_overruled_count}++;
      ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
      $methylation_call_params->{$identifier}->{bowtie_sequence}      = $three_candidate_seqs[0]->{bowtie_sequence};
      $methylation_call_params->{$identifier}->{chromosome}           = $three_candidate_seqs[0]->{chromosome};
      $methylation_call_params->{$identifier}->{position}             = $three_candidate_seqs[0]->{position};
      $methylation_call_params->{$identifier}->{index}                = $three_candidate_seqs[0]->{index};
      $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
    }
    else{
      $sequence_fails = 1;
    }
  }
  else{
    $sequence_fails = 1;
  }

  ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
    }
    if ($unmapped){
      return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
    }
    else{
      return 0; # => exits to next sequence (default)
    }
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it will have a unique best alignment
  $counting{unique_best_alignment_count}++;
  if ($pbat){
    extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
  }
  else{
    extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
  }

  ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});

  print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
}

### Private helper of check_bowtie_results_single_end: parses the Bowtie 1 alignment currently stored in
### $fhs[$index]->{last_line} and records it in the hash referenced by $mismatches, keyed by the number of
### mismatches and by a composite 'chromosome:position' location. Dies if the chromosome name does not end
### in _CT_converted/_GA_converted or if the mismatch field has an unexpected format.
sub _store_single_end_bowtie1_alignment{
  my ($index,$mismatches) = @_;

  ### tab-separated fields used here: 0 = read ID, 2 = converted chromosome name, 3 = position, 4 = sequence, 7 = mismatch information.
  ### The limit of -1 keeps trailing empty fields so that an empty mismatch field is not dropped by split
  my ($id,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,2,3,4,7];

  unless ($mismatch_info){
    $mismatch_info = '';
  }
  chomp $mismatch_info; # the mismatch field is the last one read from the line and may still carry the line ending

  ### need to extract the chromosome name from the bowtie output by removing the conversion suffix
  my $chromosome = $mapped_chromosome;
  unless ($chromosome =~ s/_(CT|GA)_converted$//){
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }

  ### Now extracting the number of mismatches to the converted genome (comma separated entries, each starting with a digit)
  my $number_of_mismatches;
  if ($mismatch_info eq ''){
    $number_of_mismatches = 0;
  }
  elsif ($mismatch_info =~ /^\d/){
    my @mismatch_entries = split (/,/,$mismatch_info);
    $number_of_mismatches = scalar @mismatch_entries;
  }
  else{
    die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
  }

  ### creating a composite location variable from $chromosome and $position and storing the alignment information in the hash table.
  ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
  ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
  ### location (the only thing which would change is the index number for the found alignment)
  my $alignment_location = join (":",$chromosome,$position);
  unless (exists $mismatches->{$number_of_mismatches}->{$alignment_location}){
    $mismatches->{$number_of_mismatches}->{$alignment_location} = {
      seq_id          => $id,
      bowtie_sequence => $bowtie_sequence,
      index           => $index,
      chromosome      => $chromosome,
      position        => $position,
    };
  }
  return;
}

### Private helper of check_bowtie_results_single_end: reads the next line from the Bowtie filehandle of
### instance $index and stores it (and its read ID) in @fhs. Returns 1 if a line was read; at the end of
### the Bowtie output last_seq_id and last_line are set to undef and 0 is returned.
sub _read_next_single_end_bowtie1_line{
  my ($index) = @_;

  my $newline = $fhs[$index]->{fh}->getline();
  if ($newline){
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $newline;
    return 1;
  }

  # assigning undef to last_seq_id and last_line (end of bowtie output)
  $fhs[$index]->{last_seq_id} = undef;
  $fhs[$index]->{last_line}   = undef;
  return 0;
}
|
|
1741
|
|
1742 sub check_bowtie_results_single_end_bowtie2{
|
|
1743 my ($sequence,$identifier,$quality_value) = @_;
|
|
1744
|
|
1745
|
|
1746 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
|
|
1747 $quality_value = 'I'x(length$sequence);
|
|
1748 }
|
|
1749
|
|
1750 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
|
|
1751 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
|
|
1752 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
|
|
1753
|
|
1754 my $alignment_ambiguous = 0;
|
|
1755
|
|
1756 my %alignments = ();
|
|
1757
|
|
1758 ### reading from the Bowtie 2 output filehandles
|
|
1759 foreach my $index (0..$#fhs){
|
|
1760 # print "Index: $index\n";
|
|
1761 # print "$fhs[$index]->{last_line}\n";
|
|
1762 # print "$fhs[$index]->{last_seq_id}\n";
|
|
1763 # sleep (1);
|
|
1764 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
|
|
1765 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
|
|
1766
|
|
1767 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
|
|
1768 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
|
|
1769
|
|
1770 if ($fhs[$index]->{last_seq_id} eq $identifier) {
|
|
1771 # SAM format specifications for Bowtie 2
|
|
1772 # (1) Name of read that aligned
|
|
1773 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
|
|
1774 # 1 The read is one of a pair
|
|
1775 # 2 The alignment is one end of a proper paired-end alignment
|
|
1776 # 4 The read has no reported alignments
|
|
1777 # 8 The read is one of a pair and has no reported alignments
|
|
1778 # 16 The alignment is to the reverse reference strand
|
|
1779 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
|
|
1780 # 64 The read is mate 1 in a pair
|
|
1781 # 128 The read is mate 2 in a pair
|
|
1782 # 256 The read has multiple mapping states
|
|
1783 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
|
|
1784 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
|
|
1785 # (5) Mapping quality (255 means MAPQ is not available)
|
|
1786 # (6) CIGAR string representation of alignment (* if unavailable)
|
|
1787 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
|
|
1788 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
|
|
1789 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
|
|
1790 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
|
|
1791 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
|
|
1792 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
|
|
1793 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
|
|
1794 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
|
|
1795 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
|
|
1796 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
|
|
1797 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
|
|
1798 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
|
|
1799 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
|
|
1800 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
|
|
1801 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
|
|
1802 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
|
|
1803
|
|
1804 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
|
|
1805
|
|
1806 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
|
|
1807 if ($flag == 4){
|
|
1808 ## reading in the next alignment, which must be the next sequence
|
|
1809 my $newline = $fhs[$index]->{fh}-> getline();
|
|
1810 if ($newline){
|
|
1811 chomp $newline;
|
|
1812 my ($seq_id) = split (/\t/,$newline);
|
|
1813 $fhs[$index]->{last_seq_id} = $seq_id;
|
|
1814 $fhs[$index]->{last_line} = $newline;
|
|
1815 if ($seq_id eq $identifier){
|
|
1816 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
|
|
1817 }
|
|
1818 next; # next instance
|
|
1819 }
|
|
1820 else{
|
|
1821 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
1822 $fhs[$index]->{last_seq_id} = undef;
|
|
1823 $fhs[$index]->{last_line} = undef;
|
|
1824 next;
|
|
1825 }
|
|
1826 }
|
|
1827
|
|
1828 # if there are one or more proper alignments we can extract the chromosome number
|
|
1829 my $chromosome;
|
|
1830 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
|
|
1831 $chromosome = $mapped_chromosome;
|
|
1832 }
|
|
1833 else{
|
|
1834 die "Chromosome number extraction failed for $mapped_chromosome\n";
|
|
1835 }
|
|
1836
|
|
1837 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
|
|
1838 my ($alignment_score,$second_best,$MD_tag);
|
|
1839 my @fields = split (/\t/,$fhs[$index]->{last_line});
|
|
1840
|
|
1841 foreach (11..$#fields){
|
|
1842 if ($fields[$_] =~ /AS:i:(.*)/){
|
|
1843 $alignment_score = $1;
|
|
1844 }
|
|
1845 elsif ($fields[$_] =~ /XS:i:(.*)/){
|
|
1846 $second_best = $1;
|
|
1847 }
|
|
1848 elsif ($fields[$_] =~ /MD:Z:(.*)/){
|
|
1849 $MD_tag = $1;
|
|
1850 }
|
|
1851 }
|
|
1852
|
|
1853 # warn "First best alignment_score is: '$alignment_score'\n";
|
|
1854 # warn "MD tag is: '$MD_tag'\n";
|
|
1855 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
|
|
1856
|
|
1857 if (defined $second_best){
|
|
1858 # warn "second best alignment_score is: '$second_best'\n\n";
|
|
1859
|
|
1860 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
|
|
1861 if ($alignment_score == $second_best){
|
|
1862 $alignment_ambiguous = 1;
|
|
1863 ## need to read and discard all additional ambiguous reads until we reach the next sequence
|
|
1864 until ($fhs[$index]->{last_seq_id} ne $identifier){
|
|
1865 my $newline = $fhs[$index]->{fh}-> getline();
|
|
1866 if ($newline){
|
|
1867 chomp $newline;
|
|
1868 my ($seq_id) = split (/\t/,$newline);
|
|
1869 $fhs[$index]->{last_seq_id} = $seq_id;
|
|
1870 $fhs[$index]->{last_line} = $newline;
|
|
1871 }
|
|
1872 else{
|
|
1873 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
1874 $fhs[$index]->{last_seq_id} = undef;
|
|
1875 $fhs[$index]->{last_line} = undef;
|
|
1876 last; # break free in case we have reached the end of the alignment output
|
|
1877 }
|
|
1878 }
|
|
1879 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
|
|
1880 }
|
|
1881 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
|
|
1882
|
|
1883 my $alignment_location = join (":",$chromosome,$position);
|
|
1884
|
|
1885 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
1886 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
|
|
1887 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
|
|
1888 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
|
|
1889
|
|
1890 unless (exists $alignments{$alignment_location}){
|
|
1891 $alignments{$alignment_location}->{seq_id} = $id;
|
|
1892 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
|
|
1893 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
|
|
1894 $alignments{$alignment_location}->{index} = $index;
|
|
1895 $alignments{$alignment_location}->{chromosome} = $chromosome;
|
|
1896 $alignments{$alignment_location}->{position} = $position;
|
|
1897 $alignments{$alignment_location}->{CIGAR} = $cigar;
|
|
1898 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
|
|
1899 }
|
|
1900
|
|
1901 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
|
|
1902 until ($fhs[$index]->{last_seq_id} ne $identifier){
|
|
1903 my $newline = $fhs[$index]->{fh}-> getline();
|
|
1904 if ($newline){
|
|
1905 chomp $newline;
|
|
1906 my ($seq_id) = split (/\t/,$newline);
|
|
1907 $fhs[$index]->{last_seq_id} = $seq_id;
|
|
1908 $fhs[$index]->{last_line} = $newline;
|
|
1909 }
|
|
1910 else{
|
|
1911 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
1912 $fhs[$index]->{last_seq_id} = undef;
|
|
1913 $fhs[$index]->{last_line} = undef;
|
|
1914 last; # break free in case we have reached the end of the alignment output
|
|
1915 }
|
|
1916 }
|
|
1917 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
|
|
1918 }
|
|
1919 }
|
|
1920 else{ # there is no second best hit, so we can just store this one and read in the next sequence
|
|
1921
|
|
1922 my $alignment_location = join (":",$chromosome,$position);
|
|
1923
|
|
1924 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
1925 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
|
|
1926 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
|
|
1927 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
|
|
1928
|
|
1929 unless (exists $alignments{$alignment_location}){
|
|
1930 $alignments{$alignment_location}->{seq_id} = $id;
|
|
1931 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
|
|
1932 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
|
|
1933 $alignments{$alignment_location}->{index} = $index;
|
|
1934 $alignments{$alignment_location}->{chromosome} = $chromosome;
|
|
1935 $alignments{$alignment_location}->{position} = $position;
|
|
1936 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
|
|
1937 $alignments{$alignment_location}->{CIGAR} = $cigar;
|
|
1938 }
|
|
1939
|
|
1940 my $newline = $fhs[$index]->{fh}-> getline();
|
|
1941 if ($newline){
|
|
1942 chomp $newline;
|
|
1943 my ($seq_id) = split (/\t/,$newline);
|
|
1944 $fhs[$index]->{last_seq_id} = $seq_id;
|
|
1945 $fhs[$index]->{last_line} = $newline;
|
|
1946 if ($seq_id eq $identifier){
|
|
1947 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
|
|
1948 }
|
|
1949 }
|
|
1950 else{
|
|
1951 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
1952 $fhs[$index]->{last_seq_id} = undef;
|
|
1953 $fhs[$index]->{last_line} = undef;
|
|
1954 }
|
|
1955 }
|
|
1956 }
|
|
1957 }
|
|
1958
|
|
1959 ### if the read produced several ambiguous alignments we can return already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
|
|
1960 if ($alignment_ambiguous == 1){
|
|
1961 $counting{unsuitable_sequence_count}++;
|
|
1962 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
|
|
1963 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
|
|
1964 # print "$ambiguous_read_output\n";
|
|
1965
|
|
1966 if ($ambiguous){
|
|
1967 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
|
|
1968 }
|
|
1969 elsif ($unmapped){
|
|
1970 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
|
|
1971 }
|
|
1972 else{
|
|
1973 return 0;
|
|
1974 }
|
|
1975 }
|
|
1976
|
|
1977 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
|
|
1978 unless(%alignments){
|
|
1979 $counting{no_single_alignment_found}++;
|
|
1980 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
|
|
1981 # print "$unmapped_read_output\n";
|
|
1982 if ($unmapped){
|
|
1983 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
|
|
1984 }
|
|
1985 else{
|
|
1986 return 0; # default
|
|
1987 }
|
|
1988 }
|
|
1989
|
|
1990 #######################################################################################################################################################
|
|
1991
|
|
1992 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
|
|
1993 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
|
|
1994 ### alignment score we are discarding the sequence altogether.
|
|
1995 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
|
|
1996 ### opening (5) and extending (3 per bp) the gap.
|
|
1997
|
|
1998 #######################################################################################################################################################
|
|
1999
|
|
2000 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
|
|
2001 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
|
|
2002
|
|
2003 ### print contents of %alignments for debugging
|
|
2004 # if (scalar keys %alignments > 1){
|
|
2005 # print "\n******\n";
|
|
2006 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
|
|
2007 # print "Loc: $alignment_location\n";
|
|
2008 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
|
|
2009 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
|
|
2010 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
|
|
2011 # print "Index $alignments{$alignment_location}->{index}\n";
|
|
2012 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
|
|
2013 # print "pos: $alignments{$alignment_location}->{position}\n";
|
|
2014 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
|
|
2015 # }
|
|
2016 # print "\n******\n";
|
|
2017 # }
|
|
2018
|
|
2019 ### if there is only 1 entry in the hash we accept it as the best alignment
|
|
2020 if (scalar keys %alignments == 1){
|
|
2021 for my $unique_best_alignment (keys %alignments){
|
|
2022 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
|
|
2023 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
|
|
2024 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
|
|
2025 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
|
|
2026 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
|
|
2027 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
|
|
2028 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
|
|
2029 }
|
|
2030 }
|
|
2031
|
|
2032 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
|
|
2033 ### we boot the sequence altogether
|
|
2034 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
|
|
2035 my $best_alignment_score;
|
|
2036 my $best_alignment_location;
|
|
2037 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
|
|
2038 # print "$alignments{$alignment_location}->{alignment_score}\n";
|
|
2039 unless (defined $best_alignment_score){
|
|
2040 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
|
|
2041 $best_alignment_location = $alignment_location;
|
|
2042 # print "setting best alignment score: $best_alignment_score\n";
|
|
2043 }
|
|
2044 else{
|
|
2045 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
|
|
2046 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
|
|
2047 # warn "Same alignment score, the sequence will get booted!\n";
|
|
2048 $sequence_fails = 1;
|
|
2049 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
|
|
2050 }
|
|
2051 ### else we are going to store the best alignment for further processing
|
|
2052 else{
|
|
2053 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
|
|
2054 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
|
|
2055 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
|
|
2056 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
|
|
2057 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
|
|
2058 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
|
|
2059 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
|
|
2060 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
|
|
2061 }
|
|
2062 }
|
|
2063 }
|
|
2064 }
|
|
2065 else{
|
|
2066 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
|
|
2067 }
|
|
2068
|
|
2069 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
|
|
2070 if ($sequence_fails == 1){
|
|
2071 $counting{unsuitable_sequence_count}++;
|
|
2072
|
|
2073 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
|
|
2074 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
|
|
2075 # print OUT "$ambiguous_read_output\n";
|
|
2076
|
|
2077 if ($ambiguous){
|
|
2078 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
|
|
2079 }
|
|
2080 elsif ($unmapped){
|
|
2081 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
|
|
2082 }
|
|
2083 else{
|
|
2084 return 0; # => exits to next sequence (default)
|
|
2085 }
|
|
2086 }
|
|
2087
|
|
2088 ### --DIRECTIONAL
|
|
2089 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
|
|
2090 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
|
|
2091 if ($directional){
|
|
2092 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
|
|
2093 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
|
|
2094 $counting{alignments_rejected_count}++;
|
|
2095 return 0;
|
|
2096 }
|
|
2097 }
|
|
2098
|
|
2099 ### If the sequence has not been rejected so far it has a unique best alignment
|
|
2100 $counting{unique_best_alignment_count}++;
|
|
2101
|
|
2102 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
|
|
2103 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
|
|
2104
|
|
2105 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
|
|
2106 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
|
|
2107 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
|
|
2108 $counting{genomic_sequence_could_not_be_extracted_count}++;
|
|
2109 return 0;
|
|
2110 }
|
|
2111
|
|
2112
|
|
2113 ### otherwise we are set to perform the actual methylation call
|
|
2114 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
|
|
2115 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
|
|
2116 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
|
|
2117 }
|
|
2118
|
|
2119
|
|
### Counts how many bases of a read are affected by the requested in-silico conversion.
###
### Arguments:
###   $sequence        - the read sequence; only the private copy made here is transliterated,
###                      the caller's string is left untouched
###   $read_conversion - 'CT' (every C is replaced by T) or 'GA' (every G is replaced by A)
###
### Returns the number of characters replaced by tr/// (0 if the sequence contained none).
### Dies if $read_conversion is anything other than 'CT' or 'GA' (this includes undef).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # a missing conversion mode is treated like an unknown one, so that we die with a clear
  # message instead of emitting 'uninitialized value' warnings first
  my $conversion = defined $read_conversion ? $read_conversion : '';

  my $number_of_transliterations;
  if ($conversion eq 'CT'){
    $number_of_transliterations = ($sequence =~ tr/C/T/);
  }
  elsif ($conversion eq 'GA'){
    $number_of_transliterations = ($sequence =~ tr/G/A/);
  }
  else{
    # no system call failed at this point, so $! would only contain an unrelated, stale OS error;
    # we report the offending value instead
    die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got '$conversion')\n";
  }
  return $number_of_transliterations;
}
|
|
2134
|
|
### Decides whether the alignment currently buffered for filehandle $index (in $fhs[$index]->{last_line})
### is a usable alignment of the read $identifier.
###
### Arguments:
###   $index      - index into @fhs selecting the alignment output being read
###   $identifier - ID of the read which is currently being processed
###
### Returns 1 if the buffered alignment, or the alignment on the line directly following it, belongs to
### $identifier and ensure_sensical_alignment_orientation_single_end() returned 1 for it; in the latter
### case the buffer is advanced to that line. Returns 0 in every other case. Whenever further lines had
### to be read the buffer (last_seq_id/last_line) is left pointing at the last line read, or is set to
### undef once the end of the output has been reached.
### Dies if the orientation check returns something other than 1 or 0, or if a third consecutive line
### carries the same read ID.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # replaces the buffered entry of this filehandle; called with (undef,undef) at the end of the bowtie output
  my $store_entry = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
  };

  # read ID and strand are the first two tab-separated fields of the buffered line (Bowtie 1 format)
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  # the buffered entry already belongs to a different read -> it gets analysed in a later round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### first attempt: the buffered alignment itself
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);   ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the buffered alignment was in the wrong orientation, so we need to look at the next line
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $store_entry->(undef,undef);
    return 0;   # nothing to process, the only alignment was in the wrong orientation
  }

  ($id,$strand) = (split (/\t/,$newline))[0,1];

  # the line just read is already the next read to be analysed -> buffer it for the next round
  unless ($id eq $identifier){
    $store_entry->($id,$newline);
    return 0;
  }

  ### second attempt: another alignment of the same read
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $store_entry->($id,$newline);
    return 1;   ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### both alignments were in the wrong orientation; the following line has to belong to a new read and is
  ### buffered so that the next round starts from it
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $store_entry->(undef,undef);
    return 0;
  }

  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);
  $store_entry->($seq_id,$newline);
  return 0;   # not processing anything this round
}
|
|
2212 #########################
|
|
2213 ### BOWTIE 1 | PAIRED-END
|
|
2214 #########################
|
|
2215
|
|
2216 sub check_bowtie_results_paired_ends{
|
|
2217 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
|
|
2218
|
|
2219 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
|
|
2220 unless ($quality_value_1){
|
|
2221 $quality_value_1 = 'I'x(length$sequence_1);
|
|
2222 }
|
|
2223 unless ($quality_value_2){
|
|
2224 $quality_value_2 = 'I'x(length$sequence_2);
|
|
2225 }
|
|
2226
|
|
2227 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
|
|
2228 # sleep (1);
|
|
2229 my %mismatches = ();
|
|
2230 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
|
|
2231
|
|
2232
|
|
2233 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
|
|
2234 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
|
|
2235 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
|
|
2236 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
|
|
2237 ### strands are not being reported by specifying --directional
|
|
2238
|
|
2239 foreach my $index (0,3,1,2){
|
|
2240 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
|
|
2241 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
|
|
2242 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
|
|
2243 if ($fhs[$index]->{last_seq_id} eq $identifier) {
|
|
2244 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
|
|
2245
|
|
2246 ##################################################################################
|
|
2247 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
|
|
2248 ##################################################################################
|
|
2249 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
|
|
2250 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
|
|
2251 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
|
|
2252 if ($valid_alignment_found == 1){
|
|
2253 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
|
|
2254 ### we store the useful information in %mismatches
|
|
2255 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
|
|
2256 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
|
|
2257 chomp $mismatch_info_1;
|
|
2258 chomp $mismatch_info_2;
|
|
2259
|
|
2260 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
|
|
2261 my ($chromosome_1,$chromosome_2);
|
|
2262 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
|
|
2263 $chromosome_1 = $mapped_chromosome_1;
|
|
2264 }
|
|
2265 else{
|
|
2266 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
|
|
2267 }
|
|
2268 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
|
|
2269 $chromosome_2 = $mapped_chromosome_2;
|
|
2270 }
|
|
2271 else{
|
|
2272 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
|
|
2273 }
|
|
2274
|
|
2275 ### Now extracting the number of mismatches to the converted genome
|
|
2276 my $number_of_mismatches_1;
|
|
2277 my $number_of_mismatches_2;
|
|
2278 if ($mismatch_info_1 eq ''){
|
|
2279 $number_of_mismatches_1 = 0;
|
|
2280 }
|
|
2281 elsif ($mismatch_info_1 =~ /^\d/){
|
|
2282 my @mismatches = split (/,/,$mismatch_info_1);
|
|
2283 $number_of_mismatches_1 = scalar @mismatches;
|
|
2284 }
|
|
2285 else{
|
|
2286 die "Something weird is going on with the mismatch field\n";
|
|
2287 }
|
|
2288 if ($mismatch_info_2 eq ''){
|
|
2289 $number_of_mismatches_2 = 0;
|
|
2290 }
|
|
2291 elsif ($mismatch_info_2 =~ /^\d/){
|
|
2292 my @mismatches = split (/,/,$mismatch_info_2);
|
|
2293 $number_of_mismatches_2 = scalar @mismatches;
|
|
2294 }
|
|
2295 else{
|
|
2296 die "Something weird is going on with the mismatch field\n";
|
|
2297 }
|
|
2298 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
|
|
2299 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
|
|
2300 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
|
|
2301 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
|
|
2302 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
|
|
2303 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
|
|
2304 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
2305 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
|
|
2306 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
|
|
2307 ### number for the found alignment)
|
|
2308 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
|
|
2309 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
|
|
2310 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
|
|
2311 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
|
|
2312 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
|
|
2313 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
|
|
2314 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
|
|
2315 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
|
|
2316 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
|
|
2317 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
|
|
2318 }
|
|
2319 ###################################################################################################################################################
|
|
2320 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
|
|
2321 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
|
|
2322 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
|
|
2323 ### this round ###
|
|
2324 ###################################################################################################################################################
|
|
2325 my $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2326 my $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2327
|
|
2328 if ($newline_1 and $newline_2){
|
|
2329 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2330 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2331
|
|
2332 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
|
|
2333 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2334 }
|
|
2335 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
|
|
2336 $fhs[$index]->{last_seq_id} = $seq_id_2;
|
|
2337 }
|
|
2338 else{
|
|
2339 die "Either read 1 or read 2 needs to end on '/1'\n";
|
|
2340 }
|
|
2341
|
|
2342 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2343 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2344 }
|
|
2345 else {
|
|
2346 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
|
|
2347 $fhs[$index]->{last_seq_id} = undef;
|
|
2348 $fhs[$index]->{last_line_1} = undef;
|
|
2349 $fhs[$index]->{last_line_2} = undef;
|
|
2350 next; # jumping to the next index
|
|
2351 }
|
|
2352 ### Now processing the entry we just stored in last_line_1 and last_line_2
|
|
2353 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
|
|
2354 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
|
|
2355 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
|
|
2356 if ($valid_alignment_found == 1){
|
|
2357 ### we store the useful information in %mismatches
|
|
2358 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
|
|
2359 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
|
|
2360 chomp $mismatch_info_1;
|
|
2361 chomp $mismatch_info_2;
|
|
2362 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
|
|
2363 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
|
|
2364 $chromosome_1 = $mapped_chromosome_1;
|
|
2365 }
|
|
2366 else{
|
|
2367 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
|
|
2368 }
|
|
2369 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
|
|
2370 $chromosome_2 = $mapped_chromosome_2;
|
|
2371 }
|
|
2372 else{
|
|
2373 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
|
|
2374 }
|
|
2375
|
|
2376 $number_of_mismatches_1='';
|
|
2377 $number_of_mismatches_2='';
|
|
2378 ### Now extracting the number of mismatches to the converted genome
|
|
2379 if ($mismatch_info_1 eq ''){
|
|
2380 $number_of_mismatches_1 = 0;
|
|
2381 }
|
|
2382 elsif ($mismatch_info_1 =~ /^\d/){
|
|
2383 my @mismatches = split (/,/,$mismatch_info_1);
|
|
2384 $number_of_mismatches_1 = scalar @mismatches;
|
|
2385 }
|
|
2386 else{
|
|
2387 die "Something weird is going on with the mismatch field\n";
|
|
2388 }
|
|
2389 if ($mismatch_info_2 eq ''){
|
|
2390 $number_of_mismatches_2 = 0;
|
|
2391 }
|
|
2392 elsif ($mismatch_info_2 =~ /^\d/){
|
|
2393 my @mismatches = split (/,/,$mismatch_info_2);
|
|
2394 $number_of_mismatches_2 = scalar @mismatches;
|
|
2395 }
|
|
2396 else{
|
|
2397 die "Something weird is going on with the mismatch field\n";
|
|
2398 }
|
|
2399 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
|
|
2400 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
|
|
2401 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
|
|
2402 die "position 1 is greater than position 2" if ($position_1 > $position_2);
|
|
2403 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
|
|
2404 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
|
|
2405 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
2406 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
|
|
2407 ### location (the genomic sequence extraction and methylation call would not be affected by this, the only thing which would change is the index
|
|
2408 ### number for the found alignment)
|
|
2409 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
|
|
2410 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
|
|
2411 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
|
|
2412 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
|
|
2413 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
|
|
2414 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
|
|
2415 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
|
|
2416 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
|
|
2417 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
|
|
2418 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
|
|
2419 }
|
|
2420 ###############################################################################################################################################
|
|
2421 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ###
|
|
2422 ###############################################################################################################################################
|
|
2423 $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2424 $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2425
|
|
2426 if ($newline_1 and $newline_2){
|
|
2427 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2428 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2429
|
|
2430 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
|
|
2431 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2432 }
|
|
2433 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
|
|
2434 $fhs[$index]->{last_seq_id} = $seq_id_2;
|
|
2435 }
|
|
2436 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2437 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2438 }
|
|
2439 else {
|
|
2440 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
|
|
2441 $fhs[$index]->{last_seq_id} = undef;
|
|
2442 $fhs[$index]->{last_line_1} = undef;
|
|
2443 $fhs[$index]->{last_line_2} = undef;
|
|
2444 next; # jumping to the next index
|
|
2445 }
|
|
2446 ### within the 2nd sequence pair alignment in correct orientation found
|
|
2447 }
|
|
2448 ### within the 1st sequence pair alignment in correct orientation found
|
|
2449 }
|
|
2450 ### still within the (last_seq_id eq identifier) condition
|
|
2451 }
|
|
2452 ### still within foreach index loop
|
|
2453 }
|
|
2454 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
|
|
2455 unless(%mismatches){
|
|
2456 $counting{no_single_alignment_found}++;
|
|
2457 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
|
|
2458 }
|
|
2459 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
|
|
2460 my $sequence_pair_fails = 0;
|
|
2461 ### Declaring an empty hash reference which will store all information we need for the methylation call
|
|
2462 my $methylation_call_params; # hash reference!
|
|
2463 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
|
|
2464 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
|
|
2465 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
|
|
2466 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
|
|
2467 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
|
|
2468 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
|
|
2469 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
|
|
2470 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
|
|
2471 }
|
|
2472 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
|
|
2473 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
|
|
2474 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
|
|
2475 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
|
|
2476 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
|
|
2477 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
|
|
2478 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
|
|
2479 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
|
|
2480 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
|
|
2481 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
|
|
2482 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
|
|
2483 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
|
|
2484 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
|
|
2485 }
|
|
2486 }
|
|
2487 else{
|
|
2488 $sequence_pair_fails = 1;
|
|
2489 }
|
|
2490 ### after processing the alignment with the lowest number of mismatches we exit
|
|
2491 last;
|
|
2492 }
|
|
2493 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
|
|
2494 if ($sequence_pair_fails == 1){
|
|
2495 $counting{unsuitable_sequence_count}++;
|
|
2496 if ($ambiguous){
|
|
2497 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
|
|
2498 }
|
|
2499 if ($unmapped){
|
|
2500 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
|
|
2501 }
|
|
2502 else{
|
|
2503 return 0; # => exits to next sequence (default)
|
|
2504 }
|
|
2505 }
|
|
2506
|
|
2507 ### --DIRECTIONAL
|
|
2508 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
|
|
2509 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
|
|
2510 if ($directional){
|
|
2511 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
|
|
2512 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
|
|
2513 $counting{alignments_rejected_count}++;
|
|
2514 return 0;
|
|
2515 }
|
|
2516 }
|
|
2517
|
|
2518 ### If the sequence has not been rejected so far it does have a unique best alignment
|
|
2519 $counting{unique_best_alignment_count}++;
|
|
2520 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
|
|
2521
|
|
2522 ### sanity check: the genomic sequences we extracted must have the same length as the observed sequences +2; only then do we perform the methylation call
|
|
2523 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
|
|
2524 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
|
|
2525 $counting{genomic_sequence_could_not_be_extracted_count}++;
|
|
2526 return 0;
|
|
2527 }
|
|
2528 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
|
|
2529 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
|
|
2530 $counting{genomic_sequence_could_not_be_extracted_count}++;
|
|
2531 return 0;
|
|
2532 }
|
|
2533
|
|
2534 ### otherwise we are set to perform the actual methylation call
|
|
2535 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
|
|
2536 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
|
|
2537
|
|
2538 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
|
|
2539 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
|
|
2540 }
|
|
2541
|
|
2542 #########################
|
|
2543 ### BOWTIE 2 | PAIRED-END
|
|
2544 #########################
|
|
2545
|
|
2546 sub check_bowtie_results_paired_ends_bowtie2{
|
|
2547 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
|
|
2548
|
|
2549 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
|
|
2550 unless ($quality_value_1){
|
|
2551 $quality_value_1 = 'I'x(length$sequence_1);
|
|
2552 }
|
|
2553
|
|
2554 unless ($quality_value_2){
|
|
2555 $quality_value_2 = 'I'x(length$sequence_2);
|
|
2556 }
|
|
2557
|
|
2558
|
|
2559 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
|
|
2560
|
|
2561
|
|
2562 my %alignments;
|
|
2563 my $alignment_ambiguous = 0;
|
|
2564
|
|
2565 ### reading from the Bowtie 2 output filehandles
|
|
2566
|
|
2567 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
|
|
2568 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
|
|
2569 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
|
|
2570 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
|
|
2571 ### strands are not being reported when '--directional' is specified
|
|
2572
|
|
2573 foreach my $index (0,3,1,2){
|
|
2574 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
|
|
2575 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
|
|
2576
|
|
2577 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
|
|
2578 if ($fhs[$index]->{last_seq_id} eq $identifier) {
|
|
2579
|
|
2580 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
|
|
2581 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
|
|
2582 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
|
|
2583 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
|
|
2584 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
|
|
2585 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
|
|
2586 $id_1 =~ s/\/1$//;
|
|
2587 $id_2 =~ s/\/2$//;
|
|
2588
|
|
2589 # SAM format specifications for Bowtie 2
|
|
2590 # (1) Name of read that aligned
|
|
2591 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
|
|
2592 # 1 The read is one of a pair
|
|
2593 # 2 The alignment is one end of a proper paired-end alignment
|
|
2594 # 4 The read has no reported alignments
|
|
2595 # 8 The read is one of a pair and has no reported alignments
|
|
2596 # 16 The alignment is to the reverse reference strand
|
|
2597 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
|
|
2598 # 64 The read is mate 1 in a pair
|
|
2599 # 128 The read is mate 2 in a pair
|
|
2600 # 256 The read has multiple mapping states
|
|
2601 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
|
|
2602 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
|
|
2603 # (5) Mapping quality (255 means MAPQ is not available)
|
|
2604 # (6) CIGAR string representation of alignment (* if unavailable)
|
|
2605 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
|
|
2606 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
|
|
2607 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
|
|
2608 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
|
|
2609 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
|
|
2610 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
|
|
2611 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
|
|
2612 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
|
|
2613 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
|
|
2614 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
|
|
2615 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
|
|
2616 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
|
|
2617 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
|
|
2618 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
|
|
2619 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
|
|
2620 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
|
|
2621
|
|
2622 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
|
|
2623 ### We can store the next alignment and move on to the next Bowtie 2 instance
|
|
2624 if ($flag_1 == 77 and $flag_2 == 141){
|
|
2625 ## reading in the next alignment, which must be the next sequence
|
|
2626 my $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2627 my $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2628
|
|
2629 if ($newline_1 and $newline_2){
|
|
2630 chomp $newline_1;
|
|
2631 chomp $newline_2;
|
|
2632 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2633 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2634 $seq_id_1 =~ s/\/1$//;
|
|
2635 $seq_id_2 =~ s/\/2$//;
|
|
2636 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2637 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2638 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2639
|
|
2640 # print "current sequence ($identifier) did not map, reading in next sequence\n";
|
|
2641 # print "$index\t$fhs[$index]->{last_seq_id}\n";
|
|
2642 # print "$index\t$fhs[$index]->{last_line_1}\n";
|
|
2643 # print "$index\t$fhs[$index]->{last_line_2}\n";
|
|
2644 next; # next instance
|
|
2645 }
|
|
2646 else{
|
|
2647 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
2648 $fhs[$index]->{last_seq_id} = undef;
|
|
2649 $fhs[$index]->{last_line_1} = undef;
|
|
2650 $fhs[$index]->{last_line_2} = undef;
|
|
2651 next;
|
|
2652 }
|
|
2653 }
|
|
2654
|
|
2655 ### If there are one or more proper alignments we can extract the chromosome number
|
|
2656 my ($chromosome_1,$chromosome_2);
|
|
2657 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
|
|
2658 $chromosome_1 = $mapped_chromosome_1;
|
|
2659 }
|
|
2660 else{
|
|
2661 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
|
|
2662 }
|
|
2663 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
|
|
2664 $chromosome_2 = $mapped_chromosome_2;
|
|
2665 }
|
|
2666 else{
|
|
2667 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
|
|
2668 }
|
|
2669
|
|
2670 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
|
|
2671
|
|
2672 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
|
|
2673 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
|
|
2674
|
|
2675 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
|
|
2676 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
|
|
2677
|
|
2678 foreach (11..$#fields_1){
|
|
2679 if ($fields_1[$_] =~ /AS:i:(.*)/){
|
|
2680 $alignment_score_1 = $1;
|
|
2681 }
|
|
2682 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
|
|
2683 $second_best_1 = $1;
|
|
2684 }
|
|
2685 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
|
|
2686 $MD_tag_1 = $1;
|
|
2687 }
|
|
2688 }
|
|
2689
|
|
2690 foreach (11..$#fields_2){
|
|
2691 if ($fields_2[$_] =~ /AS:i:(.*)/){
|
|
2692 $alignment_score_2 = $1;
|
|
2693 }
|
|
2694 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
|
|
2695 $second_best_2 = $1;
|
|
2696 }
|
|
2697 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
|
|
2698 $MD_tag_2 = $1;
|
|
2699 }
|
|
2700 }
|
|
2701
|
|
2702 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
|
|
2703 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
|
|
2704
|
|
2705 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
|
|
2706 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
|
|
2707 # warn "MD tag 1 is: '$MD_tag_1'\n";
|
|
2708 # warn "MD tag 2 is: '$MD_tag_2'\n";
|
|
2709
|
|
2710 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
|
|
2711 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
|
|
2712 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
|
|
2713
|
|
2714 if (defined $second_best_1 and defined $second_best_2){
|
|
2715 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
|
|
2716 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
|
|
2717 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
|
|
2718 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
|
|
2719
|
|
2720 # If the sum of the alignment scores (AS) of both reads is the same as the sum of their second best alignment scores (XS) we are going to boot this sequence pair altogether
|
|
2721 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
|
|
2722 $alignment_ambiguous = 1;
|
|
2723 # print "This read will be chucked (AS==XS detected)!\n";
|
|
2724
|
|
2725 ## need to read and discard all additional ambiguous reads until we reach the next sequence
|
|
2726 until ($fhs[$index]->{last_seq_id} ne $identifier){
|
|
2727 my $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2728 my $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2729 if ($newline_1 and $newline_2){
|
|
2730 chomp $newline_1;
|
|
2731 chomp $newline_2;
|
|
2732 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2733 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2734 $seq_id_1 =~ s/\/1$//;
|
|
2735 $seq_id_2 =~ s/\/2$//;
|
|
2736 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
|
|
2737
|
|
2738 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2739 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2740 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2741 }
|
|
2742 else{
|
|
2743 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
|
|
2744 $fhs[$index]->{last_seq_id} = undef;
|
|
2745 $fhs[$index]->{last_line_1} = undef;
|
|
2746 $fhs[$index]->{last_line_2} = undef;
|
|
2747 last; # break free if the end of the alignment output was reached
|
|
2748 }
|
|
2749 }
|
|
2750 # if ($fhs[$index]->{last_seq_id}){
|
|
2751 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
|
|
2752 # }
|
|
2753 }
|
|
2754 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
|
|
2755
|
|
2756 my $alignment_location;
|
|
2757 if ($position_1 <= $position_2){
|
|
2758 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
|
|
2759 }
|
|
2760 elsif($position_2 < $position_1){
|
|
2761 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
|
|
2762 }
|
|
2763
|
|
2764 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
2765 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
|
|
2766 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
|
|
2767 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
|
|
2768
|
|
2769 unless (exists $alignments{$alignment_location}){
|
|
2770 $alignments{$alignment_location}->{seq_id} = $id_1;
|
|
2771 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
|
|
2772 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
|
|
2773 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
|
|
2774 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
|
|
2775 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
|
|
2776 $alignments{$alignment_location}->{index} = $index;
|
|
2777 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
|
|
2778 $alignments{$alignment_location}->{position_1} = $position_1;
|
|
2779 $alignments{$alignment_location}->{position_2} = $position_2;
|
|
2780 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
|
|
2781 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
|
|
2782 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
|
|
2783 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
|
|
2784 $alignments{$alignment_location}->{flag_1} = $flag_1;
|
|
2785 $alignments{$alignment_location}->{flag_2} = $flag_2;
|
|
2786 }
|
|
2787 # warn "added best of several alignments to \%alignments hash\n";
|
|
2788
|
|
2789 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
|
|
2790 until ($fhs[$index]->{last_seq_id} ne $identifier){
|
|
2791 my $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2792 my $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2793 if ($newline_1 and $newline_2){
|
|
2794 chomp $newline_1;
|
|
2795 chomp $newline_2;
|
|
2796 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2797 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2798 $seq_id_1 =~ s/\/1$//;
|
|
2799 $seq_id_2 =~ s/\/2$//;
|
|
2800 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
|
|
2801
|
|
2802 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2803 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2804 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2805 }
|
|
2806 else{
|
|
2807 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
|
|
2808 $fhs[$index]->{last_seq_id} = undef;
|
|
2809 $fhs[$index]->{last_line_1} = undef;
|
|
2810 $fhs[$index]->{last_line_2} = undef;
|
|
2811 last; # break free if the end of the alignment output was reached
|
|
2812 }
|
|
2813 }
|
|
2814 # if($fhs[$index]->{last_seq_id}){
|
|
2815 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
|
|
2816 # }
|
|
2817 }
|
|
2818 }
|
|
2819 else{ # there is no second best hit, so we can just store this one and read in the next sequence
|
|
2820
|
|
2821 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
|
|
2822 # print "$alignment_location\n";
|
|
2823 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
|
|
2824 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
|
|
2825 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
|
|
2826 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
|
|
2827
|
|
2828 unless (exists $alignments{$alignment_location}){
|
|
2829 $alignments{$alignment_location}->{seq_id} = $id_1;
|
|
2830 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
|
|
2831 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
|
|
2832 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
|
|
2833 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
|
|
2834 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
|
|
2835 $alignments{$alignment_location}->{index} = $index;
|
|
2836 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
|
|
2837 $alignments{$alignment_location}->{position_1} = $position_1;
|
|
2838 $alignments{$alignment_location}->{position_2} = $position_2;
|
|
2839 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
|
|
2840 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
|
|
2841 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
|
|
2842 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
|
|
2843 $alignments{$alignment_location}->{flag_1} = $flag_1;
|
|
2844 $alignments{$alignment_location}->{flag_2} = $flag_2;
|
|
2845 }
|
|
2846
|
|
2847 # warn "added unique alignment to \%alignments hash\n";
|
|
2848
|
|
2849 # Now reading and storing the next read pair
|
|
2850 my $newline_1 = $fhs[$index]->{fh}-> getline();
|
|
2851 my $newline_2 = $fhs[$index]->{fh}-> getline();
|
|
2852 if ($newline_1 and $newline_2){
|
|
2853 chomp $newline_1;
|
|
2854 chomp $newline_2;
|
|
2855 # print "$newline_1\n";
|
|
2856 # print "$newline_2\n";
|
|
2857 my ($seq_id_1) = split (/\t/,$newline_1);
|
|
2858 my ($seq_id_2) = split (/\t/,$newline_2);
|
|
2859 $seq_id_1 =~ s/\/1$//;
|
|
2860 $seq_id_2 =~ s/\/2$//;
|
|
2861 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
|
|
2862
|
|
2863 $fhs[$index]->{last_seq_id} = $seq_id_1;
|
|
2864 $fhs[$index]->{last_line_1} = $newline_1;
|
|
2865 $fhs[$index]->{last_line_2} = $newline_2;
|
|
2866
|
|
2867 if ($seq_id_1 eq $identifier){
|
|
2868 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
|
|
2869 }
|
|
2870 }
|
|
2871 else{
|
|
2872 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
|
|
2873 $fhs[$index]->{last_seq_id} = undef;
|
|
2874 $fhs[$index]->{last_line_1} = undef;
|
|
2875 $fhs[$index]->{last_line_2} = undef;
|
|
2876 }
|
|
2877 }
|
|
2878 }
|
|
2879 }
|
|
2880
|
|
2881 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
|
|
2882 if ($alignment_ambiguous == 1){
|
|
2883 $counting{unsuitable_sequence_count}++;
|
|
2884 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
|
|
2885 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
|
|
2886 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
|
|
2887 # print "$ambiguous_read_1\n";
|
|
2888 # print "$ambiguous_read_2\n";
|
|
2889
|
|
2890 if ($ambiguous){
|
|
2891 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
|
|
2892 }
|
|
2893 elsif ($unmapped){
|
|
2894 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
|
|
2895 }
|
|
2896 else{
|
|
2897 return 0;
|
|
2898 }
|
|
2899 }
|
|
2900
|
|
2901 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
|
|
2902 unless (%alignments){
|
|
2903 $counting{no_single_alignment_found}++;
|
|
2904
|
|
2905 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
|
|
2906 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
|
|
2907 # print "$unmapped_read_1\n";
|
|
2908 # print "$unmapped_read_2\n";
|
|
2909 if ($unmapped){
|
|
2910 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
|
|
2911 }
|
|
2912 else{
|
|
2913 return 0;
|
|
2914 }
|
|
2915 }
|
|
2916
|
|
2917 #######################################################################################################################################################
|
|
2918
|
|
2919 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
|
|
2920 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
|
|
2921 ### alignment score we are discarding the sequence pair altogether.
|
|
2922 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
|
|
2923 ### and extending (3 per bp) the gap.
|
|
2924
|
|
2925 #######################################################################################################################################################
|
|
2926
|
|
2927 ### Declaring an empty hash reference which will store all information we need for the methylation call
|
|
2928 my $methylation_call_params; # hash reference
|
|
2929 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
|
|
2930
|
|
2931 ### print contents of %alignments for debugging
|
|
2932 ## if (scalar keys %alignments >= 1){
|
|
2933 # print "\n******\n";
|
|
2934 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
|
|
2935 # print "Loc: $alignment_location\n";
|
|
2936 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
|
|
2937 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
|
|
2938 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
|
|
2939 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
|
|
2940 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
|
|
2941 # print "Index $alignments{$alignment_location}->{index}\n";
|
|
2942 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
|
|
2943 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
|
|
2944 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
|
|
2945 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
|
|
2946 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
|
|
2947 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
|
|
2948 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
|
|
2949 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
|
|
2950 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
|
|
2951 # }
|
|
2952 # print "\n******\n";
|
|
2953 # }
|
|
2954
|
|
2955 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
|
|
2956 if (scalar keys %alignments == 1){
|
|
2957 for my $unique_best_alignment (keys %alignments){
|
|
2958 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
|
|
2959 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
|
|
2960 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
|
|
2961 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
|
|
2962 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
|
|
2963 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
|
|
2964 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
|
|
2965 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
|
|
2966 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
|
|
2967 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
|
|
2968 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
|
|
2969 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
|
|
2970 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
|
|
2971 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
|
|
2972 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
|
|
2973 }
|
|
2974 }
|
|
2975
|
|
2976 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
|
|
2977 ### we boot the sequence pair altogether)
|
|
2978 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
|
|
2979 my $best_sum_of_alignment_scores;
|
|
2980 my $best_alignment_location;
|
|
2981 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
|
|
2982 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
|
|
2983 unless (defined $best_sum_of_alignment_scores){
|
|
2984 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
|
|
2985 $best_alignment_location = $alignment_location;
|
|
2986 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
|
|
2987 }
|
|
2988 else{
|
|
2989 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
|
|
2990 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
|
|
2991 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
|
|
2992 $sequence_pair_fails = 1;
|
|
2993 last; # exiting since we know that the sequence has ambiguous alignments
|
|
2994 }
|
|
2995 ### else we are going to store the best alignment for further processing
|
|
2996 else{
|
|
2997 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
|
|
2998 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
|
|
2999 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
|
|
3000 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
|
|
3001 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
|
|
3002 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
|
|
3003 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
|
|
3004 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
|
|
3005 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
|
|
3006 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
|
|
3007 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
|
|
3008 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
|
|
3009 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
|
|
3010 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
|
|
3011 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
|
|
3012 last; # exiting since the sequence produced a unique best alignment
|
|
3013 }
|
|
3014 }
|
|
3015 }
|
|
3016 }
|
|
3017 else{
|
|
3018 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
|
|
3019 }
|
|
3020
|
|
3021 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
|
|
3022 if ($sequence_pair_fails == 1){
|
|
3023 $counting{unsuitable_sequence_count}++;
|
|
3024
|
|
3025 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
|
|
3026 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
|
|
3027 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
|
|
3028 # print "$ambiguous_read_1\n";
|
|
3029 # print "$ambiguous_read_2\n";
|
|
3030
|
|
3031 if ($ambiguous){
|
|
3032 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
|
|
3033 }
|
|
3034 elsif ($unmapped){
|
|
3035 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
|
|
3036 }
|
|
3037 else{
|
|
3038 return 0; # => exits to next sequence pair (default)
|
|
3039 }
|
|
3040 }
|
|
3041
|
|
3042 ### --DIRECTIONAL
|
|
3043 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
|
|
3044 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
|
|
3045 if ($directional){
|
|
3046 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
|
|
3047 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
|
|
3048 $counting{alignments_rejected_count}++;
|
|
3049 return 0;
|
|
3050 }
|
|
3051 }
|
|
3052
|
|
3053 ### If the sequence pair has not been rejected so far it does have a unique best alignment
|
|
3054 $counting{unique_best_alignment_count}++;
|
|
3055 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
|
|
3056
|
|
3057 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
|
|
3058 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
|
3
|
3059 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
|
0
|
3060 $counting{genomic_sequence_could_not_be_extracted_count}++;
|
|
3061 return 0;
|
|
3062 }
|
|
3063 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
|
3
|
3064 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
|
0
|
3065 $counting{genomic_sequence_could_not_be_extracted_count}++;
|
|
3066 return 0;
|
|
3067 }
|
|
3068
|
|
3069 ### now we are set to perform the actual methylation call
|
|
3070 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
|
|
3071 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
|
|
3072 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
|
|
3073 # print " $sequence_2\n";
|
|
3074 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
|
|
3075 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
|
|
3076
|
|
3077 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
|
|
3078 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
|
|
3079 }
|
|
3080
|
|
3081 ###
|
|
3082
|
|
### Decides whether the paired-end alignment currently buffered for one alignment instance belongs to the
### read pair being processed and was reported in a sensible orientation.
###
### Arguments:
###   $index      - position of the alignment instance within @fhs
###   $identifier - ID of the read pair being processed; it is compared against the IDs found in the first
###                 column of the buffered output lines after a trailing '/1' has been removed
###
### Returns:
###   1 - the pair held in last_line_1/last_line_2 of $fhs[$index] on return belongs to $identifier and
###       ensure_sensical_alignment_orientation_paired_ends returned 1 for it
###   0 - nothing is to be processed for $identifier from this instance in this round
###
### Side effects:
###   May read up to two further pairs of lines from $fhs[$index]->{fh} and overwrite last_seq_id, last_line_1
###   and last_line_2 of $fhs[$index] (all three are set to undef once the output is exhausted).
###
### Dies if the orientation check returns something other than 1 or 0, if neither ID of a newly read pair
### ends on /1, or if the same read pair ID is seen a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### Buffers a pair of output lines together with its read pair ID in @fhs for the next round
  ### (three undefs mark the end of the Bowtie output)
  my $buffer_pair = sub {
    my ($seq_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
    return;
  };

  ### Reads the next two lines from the output of this alignment instance
  my $read_next_pair = sub {
    my $line_1 = $fhs[$index]->{fh}->getline();
    my $line_2 = $fhs[$index]->{fh}->getline();
    return ($line_1,$line_2);
  };

  ### Finds the read that ends on /1 and removes the tag from it. Returns the read pair ID followed by both
  ### IDs as they look afterwards (only the first ID that carried the /1 tag is modified)
  my $strip_read_1_tag = sub {
    my ($first_id,$second_id) = @_;
    if ($first_id =~ s/\/1$//){
      return ($first_id,$first_id,$second_id);
    }
    if ($second_id =~ s/\/1$//){
      return ($second_id,$first_id,$second_id);
    }
    die "One of the two reads needs to end on /1!!";
  };

  ### only the ID and the strand of each buffered line are needed here
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  ### either of the two lines may hold read 1, so the /1 tag is removed from both IDs
  (my $seq_id_1 = $id_1) =~ s/\/1$//;
  (my $seq_id_2 = $id_2) =~ s/\/1$//;

  ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be
  ### analysed -> analyse next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    return 0;
  }

  ### checking the orientation of the alignment
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my ($newline_1,$newline_2) = $read_next_pair->();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output; the buffered alignment was in the wrong orientation so nothing gets processed
    $buffer_pair->(undef,undef,undef);
    return 0;
  }

  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  (my $seqid,$seq_id_1,$seq_id_2) = $strip_read_1_tag->($id_1,$id_2);

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### the new pair still belongs to the current read pair -> checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### Wrong orientation again: the following 2 lines must be the next entry, so they are read and stored in @fhs
  ($newline_1,$newline_2) = $read_next_pair->();
  unless ($newline_1 and $newline_2){
    $buffer_pair->(undef,undef,undef); # end of bowtie output
    return 0;
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);
  ($seqid) = $strip_read_1_tag->($seq_id_1,$seq_id_2);

  ### the same seq ID must not turn up a third time
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);

  $buffer_pair->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as both alignments for this read pair were in the wrong orientation
}
|
|
3203
|
|
3204 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
|
|
3205
|
|
### Extracts the genomic sequence underlying both reads of a paired-end alignment from %chromosomes and
### records which strand and which conversion each read has to be compared against.
###
### Arguments:
###   $sequence_identifier     - key of the read pair within $methylation_call_params
###   $methylation_call_params - hash reference; the record of the read pair supplies index, chromosome,
###                              start_seq_1, start_seq_2, bowtie_sequence_1 and bowtie_sequence_2
###
### The following keys are written to the record of the read pair: alignment_read_1, alignment_read_2,
### genome_conversion, read_conversion_1, read_conversion_2, unmodified_genomic_sequence_1 and
### unmodified_genomic_sequence_2. Each genomic sequence is requested 2 bases longer than the aligned read
### so that the sequence context of a C at the very end of the read can be determined as well. Where the
### boundary check for the extra bases fails the genomic sequence is set to ''.
### One of the four strand counters in %counting is incremented.
###
### Dies if the index of the alignment is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### All values of this read pair live in a single record. It is created if missing, which is what a nested
  ### hash access on $methylation_call_params would do implicitly
  $methylation_call_params->{$sequence_identifier} = {} unless (defined $methylation_call_params->{$sequence_identifier});
  my $pair = $methylation_call_params->{$sequence_identifier};

  my $index      = $pair->{index};
  my $chromosome = $pair->{chromosome};

  ### Returns the stretch of the chromosome starting at $offset that is 2 bases longer than the read
  my $genomic_window = sub {
    my ($offset,$read_sequence) = @_;
    return substr ($chromosomes{$chromosome},$offset,length($read_sequence)+2);
  };

  ### True if the chromosome is longer than $offset + read length + 1, i.e. the extra bases at the far end exist
  my $window_fits = sub {
    my ($offset,$read_sequence) = @_;
    return (length($chromosomes{$chromosome}) > $offset+length($read_sequence)+1);
  };

  ### the alignment strands tell us which strand of the genomic sequence a read is compared against, the
  ### conversions whether we are looking for C->T or G->A substitutions
  my ($alignment_read_1,$alignment_read_2);
  my ($read_conversion_info_1,$read_conversion_info_2);
  my $genome_conversion;

  ### genomic sequences for read 1 and read 2
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### Bowtie always reports the + alignment first and the - alignment second, no matter which of the two reads
  ### produced it. The first (forward) hit is described by start_seq_1/bowtie_sequence_1 and the second
  ### (reverse) hit by start_seq_2/bowtie_sequence_2. For indexes 2 and 3 read 2 is the forward hit, so the
  ### extracted sequences are swapped to keep non_bisulfite_sequence_1 with read 1.

  ### [Index 0] CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation),
  ### sequence originated from the (converted) forward strand
  if ($index == 0){
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'CT';

    ### read 1 is the forward hit: 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = $genomic_window->($pair->{start_seq_1},$pair->{bowtie_sequence_1});

    ### read 2 is the reverse hit: the extra bases are taken downstream and the window is reverse complemented
    $non_bisulfite_sequence_2 = '';
    if ($window_fits->($pair->{start_seq_2},$pair->{bowtie_sequence_2})){
      $non_bisulfite_sequence_2 = reverse_complement($genomic_window->($pair->{start_seq_2},$pair->{bowtie_sequence_2}));
    }
  }

  ### [Index 1] GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation),
  ### sequence originated from the complementary to (converted) reverse strand
  elsif ($index == 1){
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'GA';

    ### read 1 is the forward hit: GA conversion, so the 2 extra bases are needed at the 5' end
    $non_bisulfite_sequence_1 = ($pair->{start_seq_1}-1 > 0)
      ? $genomic_window->($pair->{start_seq_1}-2,$pair->{bowtie_sequence_1})
      : '';

    ### read 2 is the reverse hit: the window starts 2 bases upstream and is reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($genomic_window->($pair->{start_seq_2}-2,$pair->{bowtie_sequence_2}));
  }

  ### [Index 2] GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation),
  ### sequence originated from the complementary to (converted) forward strand
  elsif ($index == 2){
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'CT';

    ### read 1 is the reverse hit (second alignment): extra bases downstream, then reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($genomic_window->($pair->{start_seq_2},$pair->{bowtie_sequence_2}));

    ### read 2 is the forward hit (first alignment): CT conversion, 2 extra bases at the 3' end
    $non_bisulfite_sequence_2 = $window_fits->($pair->{start_seq_1},$pair->{bowtie_sequence_1})
      ? $genomic_window->($pair->{start_seq_1},$pair->{bowtie_sequence_1})
      : '';
  }

  ### [Index 3] CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation),
  ### sequence originated from the (converted) reverse strand
  elsif ($index == 3){
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'GA';

    ### read 1 is the reverse hit (second alignment): the window starts 2 bases upstream and is reverse complemented
    $non_bisulfite_sequence_1 = ($pair->{start_seq_2}-1 > 0)
      ? reverse_complement($genomic_window->($pair->{start_seq_2}-2,$pair->{bowtie_sequence_2}))
      : '';

    ### read 2 is the forward hit (first alignment): GA conversion, so the 2 extra bases are needed at the 5' end
    $non_bisulfite_sequence_2 = $genomic_window->($pair->{start_seq_1}-2,$pair->{bowtie_sequence_1});
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### handing everything back via the record of the read pair
  @{$pair}{qw(alignment_read_1 alignment_read_2 genome_conversion read_conversion_1 read_conversion_2)}
    = ($alignment_read_1,$alignment_read_2,$genome_conversion,$read_conversion_info_1,$read_conversion_info_2);
  $pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
|
|
3346
|
|
3347 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
|
|
3348
|
|
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;
  ### Extracts the genomic sequences matching both reads of a Bowtie 2 paired-end alignment.
  ###
  ### Arguments:
  ###   $sequence_identifier     - key of the read pair within $methylation_call_params
  ###   $methylation_call_params - hash reference; for this pair the keys chromosome, index, CIGAR_1, CIGAR_2,
  ###                              position_1 and position_2 are read
  ###
  ### On success the following keys are written for the pair: alignment_read_1/2, genome_conversion,
  ### read_conversion_1/2, unmodified_genomic_sequence_1/2, end_position_1/2 and indels_1/2, and the matching
  ### strand counter in %counting is incremented.
  ###
  ### Each genomic sequence gets 2 extra bases at one end (at the start for index 1 and 3, at the end for index
  ### 0 and 2) so that a CpG, CHG or CHH context can be determined for a C at the very edge of the read.
  ### If these 2 extra bases would lie outside of the chromosome the sub returns early: both
  ### unmodified_genomic_sequence_1 and unmodified_genomic_sequence_2 are stored as they stand at that point
  ### (i.e. at least one of them is shorter than read length + 2) and none of the other keys are written.
  ###
  ### Dies if a CIGAR string is malformed, contains operations other than M, I and D, or if index is not 0-3.

  my $index    = $methylation_call_params->{$sequence_identifier}->{index};
  my $params   = $methylation_call_params->{$sequence_identifier};
  my $chr_name = $params->{chromosome};
  my $cigar_1  = $params->{CIGAR_1};
  my $cigar_2  = $params->{CIGAR_2};

  ### Positions in SAM format are 1-based, so we need to subtract 1 when getting substrings
  my $pos_1 = $params->{position_1}-1;
  my $pos_2 = $params->{position_2}-1;

  ### Both CIGAR strings are split and validated before any sequence is extracted
  # parsing CIGAR 1 string
  my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
  my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
  shift @ops_1; # remove the empty first element
  die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
  # parsing CIGAR 2 string
  my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
  my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
  shift @ops_2; # remove the empty first element
  die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);

  ### which end of the genomic sequences receives the 2 extra bases
  my $extra_bases_at_start = ($index == 1 or $index == 3);
  my $extra_bases_at_end   = ($index == 0 or $index == 2);

  my $non_bisulfite_sequence_1 = '';
  my $non_bisulfite_sequence_2 = '';
  my $indels_1 = 0; # added to the Hamming distance value later on (needed for the NM field in the final SAM output)
  my $indels_2 = 0;
  my $aligned_sequence;

  ### Extracting read 1 genomic sequence ###

  if ($extra_bases_at_start){
    # >= 0 (and not > 0): offsets 0 and 1 are perfectly valid extra bases, this is the same test that is used for read 2
    unless ( ($pos_1-2) >= 0){
      # always storing both sequences so that the caller never encounters an undefined value
      $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chr_name},$pos_1-2,2);
  }

  ($aligned_sequence,$pos_1,$indels_1) = _genomic_sequence_from_cigar_bowtie2($chr_name,$pos_1,\@len_1,\@ops_1,$cigar_1,1);
  $non_bisulfite_sequence_1 .= $aligned_sequence;

  if ($extra_bases_at_end){
    unless (length($chromosomes{$chr_name}) >= $pos_1+2){
      $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_1 .= substr ($chromosomes{$chr_name},$pos_1,2);
  }

  ### Extracting read 2 genomic sequence ###

  if ($extra_bases_at_start){
    unless ( ($pos_2-2) >= 0){
      $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chr_name},$pos_2-2,2);
  }

  ($aligned_sequence,$pos_2,$indels_2) = _genomic_sequence_from_cigar_bowtie2($chr_name,$pos_2,\@len_2,\@ops_2,$cigar_2,2);
  $non_bisulfite_sequence_2 .= $aligned_sequence;

  if ($extra_bases_at_end){
    unless (length($chromosomes{$chr_name}) >= $pos_2+2){
      $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
      $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
      return;
    }
    $non_bisulfite_sequence_2 .= substr ($chromosomes{$chr_name},$pos_2,2);
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';
    ### Read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) bottom strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';
    ### Read 2 is on the reverse strand, so its genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) top strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $params->{alignment_read_1} = $alignment_read_1;
  $params->{alignment_read_2} = $alignment_read_2;
  $params->{genome_conversion} = $genome_conversion;
  $params->{read_conversion_1} = $read_conversion_info_1;
  $params->{read_conversion_2} = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## after walking the CIGAR string $pos_1/$pos_2 hold the 0-based offset just past the last aligned base
  $params->{end_position_1} = $pos_1;
  $params->{end_position_2} = $pos_2;
  $params->{indels_1} = $indels_1;
  $params->{indels_2} = $indels_2;
}

### Helper of extract_corresponding_genomic_sequence_paired_ends_bowtie2: walks one already split CIGAR string
### and collects the genomic sequence it covers.
###
### Arguments: chromosome name, 0-based start offset, array reference of operation lengths, array reference of
### operation letters, the original CIGAR string and the read number (1 or 2); the last two are only used for
### error messages.
### Returns a list of: the collected sequence, the offset just past the last aligned base, and the total
### number of inserted plus deleted bases.
### Dies if an operation other than M, I or D is encountered.
sub _genomic_sequence_from_cigar_bowtie2{
  my ($chr_name,$pos,$lengths,$operations,$cigar,$read_number) = @_;

  my $genomic_sequence = '';
  my $indels = 0;

  foreach my $i (0..$#{$lengths}){
    my $length    = $lengths->[$i];
    my $operation = $operations->[$i];

    if ($operation eq 'M'){
      # extracting genomic sequence and adjusting the position
      $genomic_sequence .= substr ($chromosomes{$chr_name},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){ # insertion in the read sequence
      # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
      # the position doesn't need adjusting
      $genomic_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($operation eq 'D'){ # deletion in the read sequence
      # we do not add any genomic sequence but only adjust the position
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/NSHPX=//){ # if these (for standard mapping) illegal characters exist we die
      die "The CIGAR $read_number string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR $read_number string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  return ($genomic_sequence,$pos,$indels);
}
|
|
3578
|
|
3579 ##########################################
|
|
3580 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
|
|
3581 ##########################################
|
|
3582
|
|
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
  ### Writes one single-end Bowtie 1 alignment together with its methylation call, either as a tab-separated
  ### line to OUT (if $vanilla is set) or via single_end_SAM_output() otherwise.
  ### Note that the stored start position of the read is incremented by 1 as a side effect.

  ### the FastQ quality is always written in Sanger encoding (Phred 33 scale)
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;
  my $read = $methylation_call_params->{$identifier};

  if ($vanilla){
    ### one tab-delimited line per uniquely mapped read
    my @fields = (
      $identifier,
      @{$read}{qw(alignment_strand chromosome position end_position)},
      $sequence,
      @{$read}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
      $quality_value,
    );
    print OUT join("\t",@fields)."\n";
  }
  else{ # SAM output
    single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
  }
}
|
|
3606
|
|
3607 ##########################################
|
|
3608 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
|
|
3609 ##########################################
|
|
3610
|
|
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;
  ### Writes one single-end Bowtie 2 alignment and its methylation call via single_end_SAM_output().

  ### the FastQ quality is always written in Sanger encoding (Phred 33 scale)
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### only mapped reads get here (unmapped and ambiguous reads were already printed)
  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
}
|
|
3625
|
|
3626 ##########################################
|
|
3627 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
|
|
3628 ##########################################
|
|
3629
|
|
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
  ### Writes one paired-end Bowtie 1 alignment together with the methylation calls of both reads, either as a
  ### tab-separated line to OUT (if $vanilla is set) or via paired_end_SAM_output() otherwise.
  ### Note that the stored start position of read 1 is incremented by 1 as a side effect.

  ### the FastQ qualities are always written in Sanger encoding (Phred 33 scale)
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # aliased, i.e. both values are converted in place
      $quality = $to_phred33->($quality);
    }
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp. (End position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;
  my $pair = $methylation_call_params->{$identifier};

  if ($vanilla){
    ### one tab-delimited line per aligned read pair
    my @fields = (
      $identifier,
      @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
      $sequence_1,
      @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
      $sequence_2,
      @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
      $quality_value_1,
      $quality_value_2,
    );
    print OUT join("\t",@fields)."\n";
  }
  else{ # SAM output
    paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
  }
}
|
|
3656
|
|
3657 ##########################################
|
|
3658 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
|
|
3659 ##########################################
|
|
3660
|
|
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;
  ### Writes one paired-end Bowtie 2 alignment and the methylation calls of both reads via paired_end_SAM_output().

  ### the FastQ qualities are always written in Sanger encoding (Phred 33 scale)
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # aliased, i.e. both values are converted in place
      $quality = $to_phred33->($quality);
    }
  }

  ### only aligned read pairs get here (unmapped and ambiguous reads were already printed)
  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
|
|
3678
|
|
3679
|
|
sub convert_phred64_quals_to_phred33{
  ### Takes a quality string in Phred+64 encoding and returns the same qualities as a Phred+33 encoded string.
  ### Every character is first turned into its Phred score and then re-encoded.
  my $qual = shift;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$qual);

  return join ("",@phred33_characters);
}
|
|
3695
|
|
sub convert_solexa_quals_to_phred33{
  ### Takes a quality string in Solexa (pre-1.3 pipeline) encoding and returns the qualities as a Phred+33
  ### encoded string. Every character is first turned into a Phred score and then re-encoded.
  my $qual = shift;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$qual);

  return join ("",@phred33_characters);
}
|
|
3711
|
|
sub convert_phred_score_into_phred33_quality_string{
  ### Returns the single character that encodes the given Phred score with an offset of 33.
  my ($phred_score) = @_;
  return chr($phred_score+33);
}
|
|
3717
|
|
sub convert_phred64_quality_string_into_phred_score{
  ### Returns the Phred score of a single quality character that was encoded with an offset of 64.
  my ($quality_character) = @_;
  return ord($quality_character)-64;
}
|
|
3723
|
|
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Returns the Phred score (rounded to the nearest integer) of a single Solexa-scaled quality character.
  ###
  ### Solexa qualities are encoded with an offset of 64 and may be negative. They are converted to the Phred
  ### scale with the standard formula
  ###   phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10)
  ### The two scales are virtually identical above a score of 10 and only differ noticeably below that.
  ### Simply subtracting a smaller offset (such as 59) instead would inflate every score by 5.
  ### The result of the formula can never be negative, so it can always be re-encoded as a Phred+33 character.
  my $string = shift;
  my $solexa_score = ord($string)-64;
  my $phred_score  = 10 * log(1 + 10 ** ($solexa_score/10)) / log(10);
  return int($phred_score+0.5);
}
|
|
3730
|
|
3731
|
|
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;
  ### Extracts the genomic sequence matching a single-end alignment and records which strand/conversion
  ### combination the alignment came from.
  ###
  ### For the read stored under $sequence_identifier the keys index, chromosome, position and bowtie_sequence
  ### are read, and alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence and
  ### end_position are written. The matching counter in %counting is incremented.
  ###
  ### The genomic sequence is 2 bases longer than the read so that a CpG, CHG or CHH context can also be
  ### determined for a C at the very last position of the read. If these 2 extra bases would lie outside of
  ### the chromosome an empty genomic sequence is stored instead.
  ### Dies if index is not one of 0, 1, 2 or 3.

  my $index = $methylation_call_params->{$sequence_identifier}->{index};
  my $read  = $methylation_call_params->{$sequence_identifier};

  ### Per index: the counter to increment, the alignment strand, the read and genome conversion, whether the 2
  ### extra bases are taken upstream (1) or downstream (0) of the alignment on the forward strand, and whether
  ### the extracted sequence needs to be reverse complemented
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$reverse_complement);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$reverse_complement) = ('CT_CT_count','+','CT','CT',0,0);
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$reverse_complement) = ('CT_GA_count','-','CT','GA',1,1);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$reverse_complement) = ('GA_CT_count','-','GA','CT',0,1);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$reverse_complement) = ('GA_GA_count','+','GA','GA',1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  my $chr_name    = $read->{chromosome};
  my $start       = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $non_bisulfite_sequence = '';
  my $was_extracted = 0;

  if ($extra_bases_upstream){
    ### 2 extra bases in front of the alignment
    if ($start-2 >= 0){
      $non_bisulfite_sequence = substr ($chromosomes{$chr_name},$start-2,$read_length+2);
      $was_extracted = 1;
    }
  }
  elsif (length($chromosomes{$chr_name}) > $start+$read_length+1){
    ### 2 extra bases behind the alignment
    $non_bisulfite_sequence = substr ($chromosomes{$chr_name},$start,$read_length+2);
    $was_extracted = 1;
  }

  ### alignments to the reverse strand are compared against the reverse complement of the forward strand sequence
  if ($was_extracted and $reverse_complement){
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
  }

  $read->{alignment_strand} = $alignment_strand;
  $read->{read_conversion} = $read_conversion_info;
  $read->{genome_conversion} = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $start+$read_length;
}
|
|
3835
|
|
3836 sub extract_corresponding_genomic_sequence_single_end_pbat {
|
|
3837 my ($sequence_identifier,$methylation_call_params) = @_;
|
|
3838 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
|
|
3839 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
|
|
3840
|
|
3841 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
|
|
3842 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
|
|
3843 my $alignment_strand;
|
|
3844 my $read_conversion_info;
|
|
3845 my $genome_conversion;
|
|
3846 ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and
|
|
3847 ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation,
|
|
3848 ### if the C happens to be at the last position of the actually observed sequence
|
|
3849 my $non_bisulfite_sequence;
|
|
3850 ### depending on the conversion we want to make need to capture 1 extra base at the 3' end
|
|
3851
|
|
3852 my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1!
|
|
3853
|
|
3854 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
|
|
3855 if ($pbat_index == 0){
|
|
3856 ### [Index 0, sequence originated from (converted) forward strand]
|
|
3857 $counting{CT_CT_count}++;
|
|
3858 $alignment_strand = '+';
|
|
3859 $read_conversion_info = 'CT';
|
|
3860 $genome_conversion = 'CT';
|
|
3861
|
|
3862 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
|
|
3863 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1
|
|
3864 ### + 2 extra base at the 3' end
|
|
3865 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
|
|
3866 }
|
|
3867 else{
|
|
3868 $non_bisulfite_sequence = '';
|
|
3869 }
|
|
3870 }
|
|
3871
|
|
3872 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
|
|
3873 elsif ($pbat_index == 1){
|
|
3874 ### [Index 1, sequence originated from (converted) reverse strand]
|
|
3875 $counting{CT_GA_count}++;
|
|
3876 $alignment_strand = '-';
|
|
3877 $read_conversion_info = 'CT';
|
|
3878 $genome_conversion = 'GA';
|
|
3879
|
|
3880 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
|
|
3881 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from >
|
|
3882 ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation
|
|
3883 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
|
|
3884 ## reverse complement!
|
|
3885 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
|
|
3886 }
|
|
3887 else{
|
|
3888 $non_bisulfite_sequence = '';
|
|
3889 }
|
|
3890 }
|
|
3891
|
|
3892 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
|
|
3893 elsif ($pbat_index == 2){
|
|
3894 ### [Index 2, sequence originated from complementary to (converted) forward strand]
|
|
3895 $counting{GA_CT_count}++;
|
|
3896 $alignment_strand = '-';
|
|
3897 $read_conversion_info = 'GA';
|
|
3898 $genome_conversion = 'CT';
|
|
3899
|
|
3900 ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
|
|
3901 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
|
|
3902 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012
|
|
3903 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2
|
|
3904 ## reverse complement!
|
|
3905 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
|
|
3906 }
|
|
3907 else{
|
|
3908 $non_bisulfite_sequence = '';
|
|
3909 }
|
|
3910 }
|
|
3911
|
|
3912 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
|
|
3913 elsif ($pbat_index == 3){
|
|
3914 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
|
|
3915 $counting{GA_GA_count}++;
|
|
3916 $alignment_strand = '+';
|
|
3917 $read_conversion_info = 'GA';
|
|
3918 $genome_conversion = 'GA';
|
|
3919
|
|
3920 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
|
|
3921 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from >
|
|
3922 ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand
|
|
3923 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2
|
|
3924 }
|
|
3925 else{
|
|
3926 $non_bisulfite_sequence = '';
|
|
3927 }
|
|
3928 }
|
|
3929 else{
|
|
3930 die "Too many bowtie result filehandles\n";
|
|
3931 }
|
|
3932
|
|
3933 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
|
|
3934 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
|
|
3935 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
|
|
3936 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
|
|
3937
|
|
3938 ### at this point we can also determine the end position of a read
|
|
3939 $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence});
|
|
3940 }
|
|
3941
|
|
3942
|
|
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  ### Reconstructs the unconverted genomic sequence underneath a single-end Bowtie 2 alignment by walking the
  ### CIGAR string of the read, and stores the result in $methylation_call_params->{$sequence_identifier}.
  ###
  ### Arguments:
  ###   $sequence_identifier     - key of the read within $methylation_call_params
  ###   $methylation_call_params - hash reference; the entry of the read supplies {CIGAR}, {chromosome},
  ###                              {position} (1-based) and {index} (0-3, the alignment process that won)
  ###
  ### Written back to the entry of the read:
  ###   {unmodified_genomic_sequence}, {alignment_strand}, {read_conversion}, {genome_conversion},
  ###   {end_position} and {indels}
  ###
  ### The genomic sequence carries 2 extra bases so that the context (CpG, CHG or CHH) of a cytosine at the very
  ### edge of the read can still be determined: for indexes 1 and 3 they are taken upstream of the alignment, for
  ### indexes 0 and 2 downstream of it. If these extra bases would fall off the chromosome the sub stores whatever
  ### has been assembled so far (i.e. a sequence lacking the 2 extra bases) and returns early, without setting any
  ### of the other fields.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### all information about this read lives in one sub-hash (created on the fly if it did not exist yet)
  my $read_info  = ($methylation_call_params->{$sequence_identifier} ||= {});
  my $cigar      = $read_info->{CIGAR};
  my $chromosome = $read_info->{chromosome};
  my $index      = $read_info->{index};

  my $genomic_sequence = '';

  ### SAM positions are 1-based, substr() offsets are 0-based
  my $position = $read_info->{position} - 1;

  ### splitting the CIGAR string into two parallel lists: the length and the type of each operation
  my @op_lengths = split (/\D+/,$cigar);
  my @op_types   = split (/\d+/,$cigar);
  shift @op_types; # the CIGAR string starts with a number, so the first element is empty
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @op_lengths == scalar @op_types);

  ### Alignments against the GA converted genome (OB, index 1 and CTOB, index 3) need the 2 extra bases upstream
  if ($index == 1 or $index == 3){
    if ($position-2 < 0){
      ### too close to the start of the chromosome
      $read_info->{unmodified_genomic_sequence} = $genomic_sequence;
      return;
    }
    $genomic_sequence .= substr ($chromosomes{$chromosome},$position-2,2);
  }

  ### number of inserted plus deleted bases, stored as {indels} at the end
  my $indels = 0;

  for my $op_number (0..$#op_lengths){
    my $length = $op_lengths[$op_number];
    my $type   = $op_types[$op_number];

    if ($type eq 'M'){
      ### (mis)match: copy the genomic bases and move along the genome
      $genomic_sequence .= substr ($chromosomes{$chromosome},$position,$length);
      $position += $length;
    }
    elsif ($type eq 'I'){
      ### insertion in the read: there is no genomic counterpart, so we pad with Ns and stay where we are
      $genomic_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($type eq 'D'){
      ### deletion in the read: skip the genomic bases without copying them
      $position += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/[NSHPX=]//){
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### Alignments against the CT converted genome (OT, index 0 and CTOT, index 2) need the 2 extra bases downstream
  if ($index == 0 or $index == 2){
    if (length($chromosomes{$chromosome}) < $position+2){
      ### too close to the end of the chromosome
      $read_info->{unmodified_genomic_sequence} = $genomic_sequence;
      return;
    }
    $genomic_sequence .= substr ($chromosomes{$chromosome},$position,2);
  }

  ### Each index stands for one combination of read conversion and genome conversion. Alignments to the
  ### '-' strand are reported in reverse complement, so the genomic sequence has to follow suit
  my ($alignment_strand,$read_conversion_info,$genome_conversion);

  if ($index == 0){
    ### CT converted read vs. CT converted genome, (converted) forward strand
    $counting{CT_CT_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','CT','CT');
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome, (converted) reverse strand
    $counting{CT_GA_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','CT','GA');
    $genomic_sequence = reverse_complement($genomic_sequence);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome, complementary to (converted) forward strand
    $counting{GA_CT_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','GA','CT');
    $genomic_sequence = reverse_complement($genomic_sequence);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome, complementary to (converted) reverse strand
    $counting{GA_GA_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','GA','GA');
  }
  else{
    die "Too many Bowtie 2 result filehandles\n";
  }

  $read_info->{alignment_strand}            = $alignment_strand;
  $read_info->{read_conversion}             = $read_conversion_info;
  $read_info->{genome_conversion}           = $genome_conversion;
  $read_info->{unmodified_genomic_sequence} = $genomic_sequence;

  ### after walking the CIGAR string $position points to the end of the alignment
  $read_info->{end_position} = $position;
  $read_info->{indels}       = $indels;
}
|
|
4076
|
|
4077 ### METHYLATION CALL
|
|
4078
|
|
sub methylation_call{
  ### Compares a read base by base with the corresponding genomic sequence and returns the methylation call
  ### string (one character per read base). The per-context counters in %counting are updated as a side effect.
  ###
  ### Arguments:
  ###   $identifier                 - read ID (only useful for debugging)
  ###   $sequence_actually_observed - the read sequence
  ###   $genomic_sequence           - the genomic sequence, expected to be 2 bases longer than the read: for 'CT'
  ###                                 the extra bases follow the read, for 'GA' they precede it
  ###   $read_conversion            - 'CT' or 'GA'; any other value is fatal
  ###
  ### Characters used in the methylation call string:
  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  ### U for methylated C in unknown context (was protected)     ###
  ### u for not methylated C in unknown context (was converted) ###
  #################################################################
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);
  # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";

  ### array lengths are numbers and therefore need to be compared numerically
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2));

  ### Both conversion types follow exactly the same logic and only differ in
  ###   - the genomic base carrying the methylation information ($cytosine) and what it looks like after conversion,
  ###   - the neighbouring base that makes it a CpG ($cpg_partner),
  ###   - where the read starts within the genomic sequence ($offset), and
  ###   - the direction in which the sequence context has to be read ($direction).
  ### For 'GA' reads we are looking at the G opposite of the cytosine, so the context lies upstream.
  my ($cytosine,$converted,$cpg_partner,$offset,$direction);
  if ($read_conversion eq 'CT'){
    ($cytosine,$converted,$cpg_partner,$offset,$direction) = ('C','T','G',0,1);
  }
  elsif ($read_conversion eq 'GA'){
    ($cytosine,$converted,$cpg_partner,$offset,$direction) = ('G','A','C',2,-1);
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  ### number of calls made for this read, keyed by the character used in the methylation call string
  my %calls = (Z => 0, z => 0, X => 0, x => 0, H => 0, h => 0, U => 0, u => 0);
  my @match;

  for my $index (0..$#seq){
    my $genomic_index = $index + $offset;
    my $genomic_base  = $genomic[$genomic_index];

    ### A call is only made if the genomic base is a cytosine (or the G opposite of it) and the read either
    ### still shows it (protected, i.e. methylated) or shows its converted form (not methylated)
    my $methylated;
    if ($seq[$index] eq $genomic_base){
      $methylated = 1 if ($genomic_base eq $cytosine);
    }
    elsif ($genomic_base eq $cytosine and $seq[$index] eq $converted){
      $methylated = 0;
    }

    ### all other matches and mismatches are not of interest for a methylation call
    unless (defined $methylated){
      push @match,'.';
      next;
    }

    ### the next two bases in context direction determine whether this is CpG, CHG or CHH
    my $first_context_base  = $genomic[$genomic_index + $direction];
    my $second_context_base = $genomic[$genomic_index + 2*$direction];

    my $call;
    if ($first_context_base eq $cpg_partner){
      $call = 'Z'; # CpG context
    }
    elsif ($first_context_base eq 'N'){
      $call = 'U'; # we cannot be sure about the context as it might have been a CpG
    }
    elsif ($second_context_base eq $cpg_partner){
      $call = 'X'; # CHG context
    }
    elsif ($second_context_base eq 'N'){
      $call = 'U'; # we cannot be sure about the context as it might have been CHG or CHH
    }
    else{
      $call = 'H'; # CHH context
    }

    ### upper case = protected (methylated), lower case = converted (not methylated)
    $call = lc $call unless ($methylated);

    ++$calls{$call};
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count}                  += $calls{H};
  $counting{total_meCHG_count}                  += $calls{X};
  $counting{total_meCpG_count}                  += $calls{Z};
  $counting{total_meC_unknown_count}            += $calls{U};
  $counting{total_unmethylated_CHH_count}       += $calls{h};
  $counting{total_unmethylated_CHG_count}       += $calls{x};
  $counting{total_unmethylated_CpG_count}       += $calls{z};
  $counting{total_unmethylated_C_unknown_count} += $calls{u};

  # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
  return $methylation_call;
}
|
|
4297
|
|
sub read_genome_into_memory{
  ### Reads all FastA files (*.fa, or *.fasta if there are no *.fa files) found in $genome_folder and stores every
  ### sequence, converted to upper case, in the %chromosomes hash using the chromosome name extracted from its
  ### FastA header as key. Multi-FastA files are supported.
  ###
  ### Argument: the working directory to return to once the genome has been read.
  ### Dies if $genome_folder cannot be entered, does not contain any FastA files, contains an empty or non-FastA
  ### file, or if a chromosome name occurs more than once.

  ## working directoy
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open with a lexical filehandle, so that unusual characters in a file name are never
    ### interpreted as an open mode or a pipe
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    die "The sequence file $chromosome_filename is empty and thus not in FASTA format as required!\n" unless (defined $first_line);
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting with an empty string (rather than undef) keeps the length checks below well defined for
    ### entries that consist of a header only
    my $sequence = '';

    ### reading into a lexical variable so that we don't overwrite $_ of the caller
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }
    close ($chr_in) or warn "Failed to close filehandle for sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
|
|
4369
|
|
sub extract_chromosome_name {
  ### Returns the chromosome name found in a FastA header line, i.e. everything between the leading '>' and the
  ### first stretch of whitespace. Bowtie appears to name its reference sequences the same way, so the names
  ### will match the ones reported in the alignments.
  ### Dies if the line does not start with '>'.
  my ($fasta_header) = @_;

  unless ($fasta_header =~ /^>/){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ### dropping the '>' and keeping only the first whitespace-delimited field
  my ($chromosome_name) = split (/\s+/,substr ($fasta_header,1));
  return $chromosome_name;
}
|
|
4381
|
|
sub reverse_complement{
  ### Returns the reverse complement of an (upper case) DNA sequence. Only the characters A, C, G and T are
  ### complemented; anything else (e.g. N or lower case bases) is kept as it is and merely changes position.
  my ($strand) = @_;

  ### complementing a copy first ...
  (my $complement = $strand) =~ tr/CATG/GTAC/;

  ### ... and then reading it backwards
  return scalar reverse ($complement);
}
|
|
4388
|
|
sub biTransformFastAFiles {
  ### Writes in-silico bisulfite converted copies of a single-end FastA read file to $temp_dir: a C -> T converted
  ### version and, unless $directional is set, also a G -> A converted version. Input files ending in .gz are
  ### decompressed on the fly; the output files are gzip compressed if $gzip is set. The options $skip, $upto
  ### and $prefix are honoured.
  ###
  ### Argument: path of the FastA file (one header line followed by one sequence line per read)
  ### Returns:  the names (without $temp_dir) of the C -> T and the G -> A converted file. The G -> A name is
  ###           returned even if the file was not written because of $directional.
  my $file = shift;

  ### the converted files are named after the input file without its path
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Opening the input via a lexical filehandle. A gzipped input file is read through zcat using the list form
  ### of open so that the file name is never seen by a shell (spaces or metacharacters are therefore harmless)
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $suffix = $gzip ? '.fa.gz' : '.fa';
  my $C_to_T_infile = "${filename}_C_to_T$suffix";
  my $G_to_A_infile = "${filename}_G_to_A$suffix";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### NOTE(review): the gzip output still goes through a shell because of the redirection, so $temp_dir and the
  ### file name are interpolated into a shell command. File names containing shell metacharacters will break here
  my ($ctot,$gtoa);

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  if ($gzip){
    open ($ctot,'|-',"gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
  }
  else{
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }

  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open ($gtoa,'|-',"gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;

  while (1){
    ### a read consists of a header line followed by a single sequence line
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### Releasing the input (and reaping zcat, if any). The return value is deliberately not checked: if we stopped
  ### early because of $upto, zcat gets terminated by a SIGPIPE and close reports this as a failure
  close ($in);

  close ($ctot) or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close ($gtoa) or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
|
|
4502
|
|
### Writes the bisulfite converted version(s) of one mate file of a paired-end FastA library
### to $temp_dir.
###
### Arguments:
###   $file        - FastA file to convert; a name ending in .gz is read through zcat
###   $read_number - 1 or 2, i.e. which mate of the read pair $file contains
###
### For directional libraries only the C -> T version of read 1 or the G -> A version of
### read 2 is written; otherwise both conversions of the file are written.
### $skip and $upto restrict which records are converted. Read IDs are passed through
### fix_IDs() and receive a /1 or /2 tag (/1/1 or /2/2 if $bowtie2 is set).
###
### Returns the name(s) of the converted file(s) (without $temp_dir):
### ($C_to_T_infile) or ($G_to_A_infile) for directional libraries, and
### ($C_to_T_infile,$G_to_A_infile) otherwise.
### Dies if a file cannot be opened or closed, if $read_number is not 1 or 2, or if a
### processed record does not start with '>'.
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  my ($dir,$filename);
  if ($file =~ /\//){
    ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never seen by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;

  ### a FastA record is expected to consist of exactly two lines: header and sequence
  while (1){
    my $header = <IN>;
    my $sequence= <IN>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE: fix_IDs() above has already replaced every tab in the header, so this counter cannot be incremented here
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    ### the read tag is inserted in front of the trailing newline of the header
    if ($read_number == 1){
      if ($bowtie2){
        $header =~ s/$/\/1\/1/;
      }
      else{
        $header =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $header =~ s/$/\/2\/2/;
      }
      else{
        $header =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){

      if ($read_number == 1){
        print CTOT "$header$sequence_C_to_T";
      }
      elsif ($read_number == 2){
        print GTOA "$header$sequence_G_to_A";
      }
    }
    else{
      print CTOT "$header$sequence_C_to_T";
      print GTOA "$header$sequence_G_to_A";
    }
  }

  ### The return value is deliberately ignored: when the loop stopped early because of $upto, a zcat
  ### pipe is closed before it has delivered all of its data
  close IN;

  ### The converted files have to be closed (= flushed to disk) before they are handed on
  if ($directional){
    if ($read_number == 1){
      close CTOT or die "Failed to close filehandle $!\n";
      warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      close GTOA or die "Failed to close filehandle $!\n";
      warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }
  else{
    close CTOT or die "Failed to close filehandle $!\n";
    close GTOA or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
|
|
4659
|
|
4660
|
|
### Writes the bisulfite converted version(s) of a single-end FastQ file to $temp_dir.
###
### Argument:
###   $file - FastQ file to convert; a name ending in .gz is read through zcat
###
### Which conversions are written depends on the library type:
###   $pbat set              : G -> A version only
###   $directional set       : C -> T version only
###   neither (all 4 strands): C -> T and G -> A version
### If $gzip is set the converted files are written through gzip and end in .fastq.gz.
### $skip and $upto restrict which records are converted, and read IDs are passed through fix_IDs().
###
### Returns ($G_to_A_infile) for PBAT libraries and ($C_to_T_infile,$G_to_A_infile) otherwise
### (file names without $temp_dir). For directional libraries the second element is only
### the (prefixed) input file name, as no G -> A file is written.
### Dies if a file cannot be opened or closed, or if the first record does not look like FastQ.
sub biTransformFastQFiles {
  my $file = shift;
  my ($dir,$filename);
  if ($file =~ /\//){
    ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never seen by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### $pbat takes precedence over $directional everywhere in this subroutine (opening, writing and closing)
  my $write_C_to_T = $pbat ? 0 : 1;
  my $write_G_to_A = ($pbat or !$directional) ? 1 : 0;
  my $suffix = $gzip ? '.fastq.gz' : '.fastq';

  if ($write_C_to_T){
    $C_to_T_infile .= "_C_to_T$suffix";
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

    if ($gzip){
      open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
    }
  }

  if ($write_G_to_A){
    $G_to_A_infile .= "_G_to_A$suffix";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";

    if ($gzip){
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;
  while (1){
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file. This has to happen before
    ## $skip is applied, otherwise the first record would never be looked at when skipping reads
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE: fix_IDs() above has already replaced every tab in the identifier, so this counter cannot be incremented here
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }

    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The return value is deliberately ignored: when the loop stopped early because of $upto, a zcat
  ### pipe is closed before it has delivered all of its data
  close IN;

  ### only the filehandles that were actually opened above are closed
  if ($write_C_to_T){
    close CTOT or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close GTOA or die "Failed to close filehandle $!\n";
  }

  if ($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    return ($G_to_A_infile);
  }
  elsif ($directional){
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
|
|
4818
|
|
### Writes the bisulfite converted version(s) of one mate file of a paired-end FastQ library
### to $temp_dir.
###
### Arguments:
###   $file        - FastQ file to convert; a name ending in .gz is read through zcat
###   $read_number - 1 or 2, i.e. which mate of the read pair $file contains
###
### For directional libraries only the C -> T version of read 1 or the G -> A version of
### read 2 is written; otherwise both conversions of the file are written. If $gzip is set the
### converted files are written through gzip and end in .fastq.gz.
### $skip and $upto restrict which records are converted. Read IDs are passed through
### fix_IDs() and receive a /1 or /2 tag (/1/1 or /2/2 if $bowtie2 is set).
###
### Returns the name(s) of the converted file(s) (without $temp_dir):
### ($C_to_T_infile) or ($G_to_A_infile) for directional libraries, and
### ($C_to_T_infile,$G_to_A_infile) otherwise.
### Dies if a file cannot be opened or closed, if $read_number is not 1 or 2, or if the
### first record does not look like FastQ.
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;
  my ($dir,$filename);

  if ($file =~ /\//){
    ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never seen by a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($gzip){
    $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
    $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
  }
  else{
    $C_to_T_infile =~ s/$/_C_to_T.fastq/;
    $G_to_A_infile =~ s/$/_G_to_A.fastq/;
  }

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      if ($gzip){
        open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
      }
      else{
        open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
      }
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      if ($gzip){
        open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
      }
      else{
        open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
      }
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;
  while (1){
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    ## small check if the sequence file appears to be a FastQ file. This has to happen before
    ## $skip is applied, otherwise the first record would never be looked at when skipping reads
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence= uc$sequence; # make input file case insensitive

    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    ### the read tag is inserted in front of the trailing newline of the identifier
    if ($read_number == 1){
      if ($bowtie2){
        $identifier =~ s/$/\/1\/1/;
      }
      else{
        $identifier =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $identifier =~ s/$/\/2\/2/;
      }
      else{
        $identifier =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2\n";
    }

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){
      if ($read_number == 1){
        print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      }
      else{
        print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
    else{
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The return value is deliberately ignored: when the loop stopped early because of $upto, a zcat
  ### pipe is closed before it has delivered all of its data
  close IN;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close CTOT or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close GTOA or die "Failed to close filehandle $!\n";
      return ($G_to_A_infile);
    }
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
    close CTOT or die "Failed to close filehandle $!\n";
    close GTOA or die "Failed to close filehandle $!\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
|
|
4993
|
|
4994
|
|
4995 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
|
|
4996
|
|
### Converts both mate files of a paired-end FastQ library at the same time and writes the
### read pairs as gzipped, tab-separated files to $temp_dir, one line per read pair:
###   <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>
###
### Arguments:
###   $file_1 - FastQ file with the first mates; a name ending in .gz is read through zcat
###   $file_2 - FastQ file with the second mates; a name ending in .gz is read through zcat
###
### The CT_plus_GA file holds C -> T converted read 1 and G -> A converted read 2. Unless
### $directional is set, a GA_plus_CT file (G -> A converted read 1, C -> T converted read 2)
### is written as well. Output file names are derived from $file_1 (and $prefix if set).
### $skip and $upto restrict which read pairs are converted. Reading stops as soon as either
### input file fails to deliver a complete record.
###
### Returns ($CT_plus_GA_infile) for directional libraries and
### ($CT_plus_GA_infile,$GA_plus_CT_infile) otherwise (file names without $temp_dir).
### Dies if a file cannot be opened or closed, or if the first record of either file does
### not look like FastQ.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;
  my ($dir,$filename);

  if ($file_1 =~ /\//){
    ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/;
  }
  else{
    $filename = $file_1;
  }

  ### gzipped version of infile 1
  ### (list form of the pipe open: the file name is handed to zcat directly and never seen by a shell)
  if ($file_1 =~ /\.gz$/){
    open (IN_1,'-|','zcat',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  else{
    open (IN_1,'<',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  ### gzipped version of infile 2
  if ($file_2 =~ /\.gz$/){
    open (IN_2,'-|','zcat',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }
  else{
    open (IN_2,'<',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  unless ($directional){
    ### progress messages go to STDERR like all other messages of the conversion routines
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1 = <IN_1>;
    my $sequence_1 = <IN_1>;
    my $identifier2_1 = <IN_1>;
    my $quality_score_1 = <IN_1>;

    my $identifier_2 = <IN_2>;
    my $sequence_2 = <IN_2>;
    my $identifier2_2 = <IN_2>;
    my $quality_score_2 = <IN_2>;

    last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    ### only the ID of read 1 is written out; it identifies the whole read pair
    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    $sequence_1 = uc$sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc$sequence_2; # make input file 2 case insensitive

    my $sequence_1_C_to_T = $sequence_1;
    my $sequence_2_G_to_A = $sequence_2;
    $sequence_1_C_to_T =~ tr/C/T/;
    $sequence_2_G_to_A =~ tr/G/A/;

    print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      my $sequence_1_G_to_A = $sequence_1;
      my $sequence_2_C_to_T = $sequence_2;
      $sequence_1_G_to_A =~ tr/G/A/;
      $sequence_2_C_to_T =~ tr/C/T/;
      print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### The return values are deliberately ignored: when the loop stopped before the end of the input,
  ### a zcat pipe is closed before it has delivered all of its data
  close IN_1;
  close IN_2;

  close CTPLUSGA or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }
  else{
    close GAPLUSCT or die "Couldn't close filehandle\n";
    warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
    return ($CT_plus_GA_infile,$GA_plus_CT_infile);
  }
}
|
|
5134
|
|
5135
|
|
### Returns a copy of the given read ID line in which every run of spaces and/or tabs has
### been collapsed into a single underscore. All other characters, including a trailing
### newline, are left untouched.
sub fix_IDs{
  my ($read_id) = @_;
  $read_id =~ s/[ \t]+/_/g;
  return $read_id;
}
|
|
5141
|
|
### Checks whether a single-end alignment reported by alignment process $index lies on the
### strand that this process is expected to produce, and updates the process counters.
###
### Arguments:
###   $index  - position of the alignment process in @fhs
###   $strand - strand of the alignment, '+' or '-'
###
### Expected strand per process name:
###   CTreadCTgenome (C->T read against C->T genome): +
###   CTreadGAgenome (C->T read against G->A genome): -
###   GAreadCTgenome (G->A read against C->T genome): -
###   GAreadGAgenome (G->A read against G->A genome): +
###
### Returns 1 and increments {seen} if the alignment is on the expected strand, returns 0
### and increments {wrong_strand} if it is on the opposite (nonsensical) strand.
### Dies if the process name is none of the four listed above.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index,$strand) = @_;

  my %expected_strand_for = (
    CTreadCTgenome => '+',
    CTreadGAgenome => '-',
    GAreadCTgenome => '-',
    GAreadGAgenome => '+',
  );

  my $process = $fhs[$index]->{name};
  unless (exists $expected_strand_for{$process}){
    die "One of the above conditions must be true\n";
  }

  my $expected = $expected_strand_for{$process};
  my $opposite = ($expected eq '+') ? '-' : '+';

  if ($strand eq $expected){
    $fhs[$index]->{seen}++;
    return 1;
  }
  elsif ($strand eq $opposite){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }
}
|
|
5214
|
|
### Checks whether a paired-end alignment reported by alignment process $index has its two
### mates in the order that this process is expected to produce, and updates the process
### counters.
###
### Arguments:
###   $index             - position of the alignment process in @fhs
###   $id_1, $strand_1   - read ID and strand of the first alignment of the pair
###   $id_2, $strand_2   - read ID and strand of the second alignment of the pair
###
### Only pairs whose first alignment is (+) and whose second alignment is (-) are recognised.
### The last character of the read IDs (1 or 2) tells which mate was reported first:
###   CTread1GAread2CTgenome and GAread1CTread2GAgenome: read 1 first is the sensical order
###   GAread1CTread2CTgenome and CTread1GAread2GAgenome: read 2 first is the sensical order
###
### Returns 1 and increments {seen} for the sensical order, returns 0 and increments
### {wrong_strand} for the opposite order. Dies for any other combination of IDs and
### strands, and if the process name is none of the four listed above.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  ### 1 = read 1 has to be reported first, 0 = read 2 has to be reported first
  my %read_1_first_for = (
    CTread1GAread2CTgenome => 1,
    GAread1CTread2GAgenome => 1,
    GAread1CTread2CTgenome => 0,
    CTread1GAread2GAgenome => 0,
  );

  my $process = $fhs[$index]->{name};
  unless (exists $read_1_first_for{$process}){
    die "One of the above conditions must be true\n";
  }

  my $plus_then_minus = ($strand_1 eq '+' and $strand_2 eq '-');
  my $read_1_first = ($plus_then_minus and $id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $read_2_first = ($plus_then_minus and $id_1 =~ /2$/ and $id_2 =~ /1$/);

  my ($sensical,$nonsensical) = $read_1_first_for{$process}
    ? ($read_1_first,$read_2_first)
    : ($read_2_first,$read_1_first);

  if ($sensical){
    $fhs[$index]->{seen}++;
    return 1;
  }
  if ($nonsensical){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }
  die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
}
|
|
5311
|
|
5312 #####################################################################################################################################################
|
|
5313
|
|
5314 ### Bowtie 1 (default) | PAIRED-END | FASTA
|
|
5315
|
|
sub paired_end_align_fragments_to_bisulfite_genome_fastA {

  ### Bowtie 1 | PAIRED-END | FASTA
  ### Starts one Bowtie paired-end alignment per entry of the global @fhs and primes each entry with
  ### the first two lines of Bowtie output.
  ###
  ### Arguments: C->T and G->A converted files of read 1, followed by those of read 2. They are only
  ###            used for the progress messages; the files passed to Bowtie are taken from the
  ###            inputfile_1/inputfile_2 fields of each @fhs entry.
  ### Effects:   sets {fh} (read pipe from Bowtie), {last_seq_id}, {last_line_1} and {last_line_2} on
  ###            every processed entry. Entries skipped in directional mode, and instances that
  ###            produced fewer than two lines, get undef for the last three fields.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs ends in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  warn( $directional
        ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
        : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n" );

  ## Up to 4 instances of Bowtie are started below; the first output lines of each are stored in @fhs
  warn( $directional
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

  foreach my $fh (@fhs) {

    # in directional mode, entries without an input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}) {
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner: two of the instances are
    ### restricted with --norc, all others with --nofw
    my $norc_instance = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $norc_instance ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # without a complete first pair of output lines the entry is initialised as undefined
    unless ($line_1 and $line_2) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);

    # the sequence identifier is the first tab-separated field of each output line
    my $id_1 = (split(/\t/,$line_1))[0];
    my $id_2 = (split(/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ### either read. The ID of read 1 (with its /1 tag removed) is stored as last_seq_id.
    if ($id_1 =~ s/\/1$//) {
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//) {
      $fh->{last_seq_id} = $id_2;
    }
    else {
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    # each of these holds either read 1 or read 2, in Bowtie's output order
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
|
|
5394
|
|
5395 ### Bowtie 2 | PAIRED-END | FASTA
|
|
5396
|
|
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {

  ### Bowtie 2 | PAIRED-END | FASTA
  ### Starts one Bowtie 2 paired-end alignment per entry of the global @fhs, skips the SAM header and
  ### primes each entry with the first two SAM records.
  ###
  ### Arguments: C->T and G->A converted files of read 1, followed by those of read 2. They are only
  ###            used for the progress messages; the files passed to Bowtie 2 are taken from the
  ###            inputfile_1/inputfile_2 fields of each @fhs entry.
  ### Effects:   sets {fh} (read pipe from Bowtie 2), {last_seq_id}, {last_line_1} and {last_line_2}
  ###            on every processed entry. Entries skipped in directional mode, and instances that
  ###            produced fewer than two records, get undef for the last three fields.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs ends in /1.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files; the first
  ## records of each instance are stored in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode, entries without an input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so everything up to the first record (header lines start with @)
    ### is skipped. A lexical variable is used so that the caller's $_ is never overwritten.
    my $line_1 = $fh->{fh}->getline();
    while (defined $line_1 and $line_1 =~ /^\@/){
      $line_1 = $fh->{fh}->getline();
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced a first pair of records we store them
    if (defined $line_1 and defined $line_2) {
      chomp $line_1;
      chomp $line_2;
      my $id_1 = (split(/\t/,$line_1))[0]; # first field of the first record (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # first field of the second record

      ### Either record can be read 1. We identify which one it is and store its ID as last_seq_id.
      ### (Bowtie 2 clips off /1 or /2 endings itself, so /1/1 or /2/2 was added to start with.)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        # This used to be a warning only, which stored the lines without a matching last_seq_id.
        # The other paired-end routines treat this as fatal, and so do we.
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
|
|
5484
|
|
5485 ### Bowtie 1 (default) | PAIRED-END | FASTQ
|
|
5486
|
|
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {

  ### Bowtie 1 | PAIRED-END | FASTQ
  ### Starts one Bowtie paired-end alignment per entry of the global @fhs and primes each entry with
  ### the first two lines of Bowtie output.
  ###
  ### Arguments: C->T and G->A converted files of read 1, followed by those of read 2. They are only
  ###            used for the progress messages; the files passed to Bowtie are taken from the
  ###            inputfile_1/inputfile_2 fields of each @fhs entry.
  ### Effects:   sets {fh} (read pipe from Bowtie), {last_seq_id}, {last_line_1} and {last_line_2} on
  ###            every processed entry. Entries skipped in directional/PBAT mode, and instances that
  ###            produced fewer than two lines, get undef for the last three fields.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs ends in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $two_instances_only = ($directional or $pbat);

  warn( $directional ? "Input file is $C_to_T_infile_1 (FastQ)\n"
      : $pbat        ? "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"
      :                "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n" );

  ## Up to 4 instances of Bowtie are started below; the first output lines of each are stored in @fhs
  warn( $two_instances_only
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

  foreach my $fh (@fhs) {

    # skipping unwanted filehandles: in directional or PBAT mode only entries with an input file are run
    if ($two_instances_only and not $fh->{inputfile_1}) {
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner: two of the instances are
    ### restricted with --norc, all others with --nofw
    my $norc_instance = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $norc_instance ? ' --norc' : ' --nofw';

    if ($gzip) {
      # gzipped input: only inputfile_1 is read; it is decompressed by zcat and piped to Bowtie (--12 -)
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
      open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!";
    }
    else {
      # uncompressed input: the two mate files are passed with -1 and -2
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
      sleep(5);
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
    }

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # without a complete first pair of output lines the entry is initialised as undefined
    unless ($line_1 and $line_2) {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp ($line_1, $line_2);

    # the sequence identifier is the first tab-separated field of each output line
    my $id_1 = (split(/\t/,$line_1))[0];
    my $id_2 = (split(/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ### either read. The ID of read 1 (with its /1 tag removed) is stored as last_seq_id.
    if ($id_1 =~ s/\/1$//) {
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//) {
      $fh->{last_seq_id} = $id_2;
    }
    else {
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    # each of these holds either read 1 or read 2, in Bowtie's output order
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
|
|
5575
|
|
5576 ### Bowtie 2 | PAIRED-END | FASTQ
|
|
5577
|
|
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  ### Bowtie 2 | PAIRED-END | FASTQ
  ### Starts one Bowtie 2 paired-end alignment per entry of the global @fhs, skips the SAM header and
  ### primes each entry with the first two SAM records.
  ###
  ### Arguments: C->T and G->A converted files of read 1, followed by those of read 2. They are only
  ###            used for the progress messages; the files passed to Bowtie 2 are taken from the
  ###            inputfile_1/inputfile_2 fields of each @fhs entry.
  ### Effects:   sets {fh} (read pipe from Bowtie 2), {last_seq_id}, {last_line_1} and {last_line_2}
  ###            on every processed entry. Entries skipped in directional mode, and instances that
  ###            produced fewer than two records, get undef for the last three fields.
  ### Dies if the pipe cannot be opened, or if neither of the first two IDs ends in /1.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files; the first
  ## records of each instance are stored in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode, entries without an input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so everything up to the first record (header lines start with @)
    ### is skipped. A lexical variable is used so that the caller's $_ is never overwritten.
    my $line_1 = $fh->{fh}->getline();
    while (defined $line_1 and $line_1 =~ /^\@/){
      $line_1 = $fh->{fh}->getline();
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced a first pair of records we store them
    if (defined $line_1 and defined $line_2) {
      chomp $line_1;
      chomp $line_2;

      my $id_1 = (split(/\t/,$line_1))[0]; # first field of the first record (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # first field of the second record

      ### Either record can be read 1. We identify which one it is and store its ID as last_seq_id.
      ### (Bowtie 2 clips off /1 or /2 endings itself, so /1/1 or /2/2 was added to start with.)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
|
|
5666
|
|
5667 #####################################################################################################################################################
|
|
5668
|
|
5669 ### Bowtie 1 (default) | SINGLE-END | FASTA
|
|
sub single_end_align_fragments_to_bisulfite_genome_fastA {

  ### Bowtie 1 | SINGLE-END | FASTA
  ### Starts one Bowtie alignment per entry of the global @fhs and primes each entry with the first
  ### line of Bowtie output.
  ###
  ### Arguments: the C->T and the G->A converted read file. They are only used for the progress
  ###            messages; the file passed to Bowtie is taken from the inputfile field of each entry.
  ### Effects:   sets {fh} (read pipe from Bowtie), {last_seq_id} and {last_line} on every entry; the
  ###            last two are undef if the instance produced no output.
  ### Dies if the pipe cannot be opened.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files; the first
  ## output line of each instance is stored in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    if ($gzip){
      # gzipped input is decompressed by zcat and piped to Bowtie on standard input
      open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # if Bowtie produces an alignment we store the first line of the output. The line is read into a
    # lexical variable so that the caller's $_ is never overwritten.
    my $line = $fh->{fh}->getline();
    if (defined $line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
|
|
5723
|
|
5724 ### Bowtie 2 | SINGLE-END | FASTA
|
|
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {

  ### Bowtie 2 | SINGLE-END | FASTA
  ### Starts one Bowtie 2 alignment per entry of the global @fhs, skips the SAM header and primes each
  ### entry with the first SAM record.
  ###
  ### Arguments: the C->T and the G->A converted read file. They are only used for the progress
  ###            messages; the file passed to Bowtie 2 is taken from the inputfile field of each entry.
  ### Effects:   sets {fh} (read pipe from Bowtie 2), {last_seq_id} and {last_line} on every entry;
  ###            the last two are undef if the instance produced no record.
  ### Dies if the pipe cannot be opened.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files; the first
  ## record of each instance is stored in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so everything up to the first record (header lines start with @)
    ### is skipped. A lexical variable is used so that the caller's $_ is never overwritten.
    my $line = $fh->{fh}->getline();
    while (defined $line and $line =~ /^\@/){
      $line = $fh->{fh}->getline();
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if (defined $line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
|
|
5783
|
|
5784
|
|
5785 ### Bowtie 1 (default) | SINGLE-END | FASTQ
|
|
sub single_end_align_fragments_to_bisulfite_genome_fastQ {

  ### Bowtie 1 | SINGLE-END | FASTQ
  ### Starts one Bowtie alignment per entry of the global @fhs and primes each entry with the first
  ### line of Bowtie output.
  ###
  ### Arguments: the C->T and the G->A converted read file. They are only used for the progress
  ###            messages; the file passed to Bowtie is taken from the inputfile field of each entry.
  ### Effects:   sets {fh} (read pipe from Bowtie), {last_seq_id} and {last_line} on every entry; the
  ###            last two are undef if the instance produced no output.
  ### Dies if the pipe cannot be opened.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files; the first
  ## output line of each instance is stored in @fhs
  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5);

    # The per-instance $bt_options (including --norc/--nofw) must be passed to Bowtie. Passing the
    # plain $bowtie_options here would drop the strand restriction that was just reported above.
    if ($gzip){
      # gzipped input is decompressed by zcat and piped to Bowtie on standard input
      open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # if Bowtie produces an alignment we store the first line of the output. The line is read into a
    # lexical variable so that the caller's $_ is never overwritten.
    my $line = $fh->{fh}->getline();
    if (defined $line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
|
|
5844
|
|
5845 ### Bowtie 2 | SINGLE-END | FASTQ
|
|
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  ### Bowtie 2 | SINGLE-END | FASTQ
  ### Starts one Bowtie 2 alignment per entry of the global @fhs, skips the SAM header and primes each
  ### entry with the first SAM record.
  ###
  ### Arguments: the C->T and the G->A converted read file. They are only used for the progress
  ###            messages; the file passed to Bowtie 2 is taken from the inputfile field of each entry.
  ### Effects:   sets {fh} (read pipe from Bowtie 2), {last_seq_id} and {last_line} on every entry;
  ###            the last two are undef if the instance produced no record.
  ### Dies if the pipe cannot be opened.
  ### NOTE: the command is run through the shell; paths and options are interpolated unquoted.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files; the first
  ## record of each instance is stored in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so everything up to the first record (header lines start with @)
    ### is skipped. A lexical variable is used so that the caller's $_ is never overwritten.
    my $line = $fh->{fh}->getline();
    while (defined $line and $line =~ /^\@/){
      $line = $fh->{fh}->getline();
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if (defined $line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
|
|
5906
|
|
5907 ###########################################################################################################################################
|
|
5908
|
|
sub reset_counters_and_fhs{

  ### Resets every entry of the global %counting hash to 0 and rebuilds the global @fhs list of
  ### alignment instances for the given input.
  ###
  ### Argument: the file name being processed; a name containing a comma is treated as paired-end.
  ### @fhs receives fresh hash references (name, strand_identity, bisulfiteIndex, seen, wrong_strand):
  ###   - non-directional libraries, and all paired-end input: all four instances
  ###   - directional single-end input:                         the two CT read instances
  ###   - PBAT single-end input:                                the two GA read instances
  my $filename = shift;

  my @counter_names = (
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_meC_unknown_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    'total_unmethylated_C_unknown_count',
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    'CT_CT_count',               # (CT read/CT genome, original top strand)
    'CT_GA_count',               # (CT read/GA genome, original bottom strand)
    'GA_CT_count',               # (GA read/CT genome, complementary to original top strand)
    'GA_GA_count',               # (GA read/GA genome, complementary to original bottom strand)
    'CT_GA_CT_count',            # (CT read1/GA read2/CT genome, original top strand)
    'GA_CT_GA_count',            # (GA read1/CT read2/GA genome, complementary to original bottom strand)
    'GA_CT_CT_count',            # (GA read1/CT read2/CT genome, complementary to original top strand)
    'CT_GA_GA_count',            # (CT read1/GA read2/GA genome, original bottom strand)
    'alignments_rejected_count', # only relevant if --directional was specified
  );
  %counting = map { ($_ => 0) } @counter_names;

  # builds one fresh @fhs entry with its counters at zero
  my $new_instance = sub {
    my ($name,$strand_identity,$index_basename) = @_;
    return {
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  };

  # [ name, strand identity, bisulfite index ] of the four possible instances, in their fixed order
  my @CT_read_instances = (
    ['CTreadCTgenome','con ori forward',$CT_index_basename],
    ['CTreadGAgenome','con ori reverse',$GA_index_basename],
  );
  my @GA_read_instances = (
    ['GAreadCTgenome','compl ori con forward',$CT_index_basename],
    ['GAreadGAgenome','compl ori con reverse',$GA_index_basename],
  );

  my $strand_specific = ($directional or $pbat);
  my @selected;

  # the file name is only inspected for directional or PBAT libraries
  if (!$strand_specific or $filename =~ ','){ # non-directional, or paired-end files
    @selected = (@CT_read_instances,@GA_read_instances);
  }
  elsif ($directional){ # directional single-end files
    @selected = @CT_read_instances;
  }
  else{ # PBAT single-end files
    @selected = @GA_read_instances;
  }

  @fhs = map { $new_instance->(@$_) } @selected;
}
|
|
6058
|
|
6059
|
|
### Parses and validates the command line (options via Getopt::Long, followed by the
### positional arguments left in @ARGV: the genome folder and, for single-end mode,
### the sequence files).
###
### Side effects visible in this subroutine:
###   - @ARGV is consumed (GetOptions removes the options, the genome folder is shifted off)
###   - the file-level array @filenames is populated (single-end: one entry per file,
###     paired-end: one "mate1,mate2" entry per file pair)
###   - the working directory is changed several times (genome folder, index folders,
###     $parent_dir, output and temp directory); output/temp directories are created
###     if they do not exist yet
###   - prints the help text or the version and exits if --help/--version was given
###   - dies with an explanatory message on any invalid or incompatible option
###
### Returns a list of 29 values, in this order:
###   genome folder (absolute, trailing '/'), C->T index basename, G->A index basename,
###   path to Bowtie, sequence format ('FASTA' or 'FASTQ'), Bowtie options (one
###   space-separated string), directional (1/0), unmapped, ambiguous, phred64, solexa,
###   output dir, bowtie2, vanilla, sam_no_hd, skip, upto, temp dir, non_bs_mm,
###   insertion open, insertion extend, deletion open, deletion extend, gzip,
###   bam (1 = via Samtools, 2 = fall back to gzip), samtools path, pbat, prefix, old_flag
sub process_command_line{
  my @bowtie_options; # collected here and joined into a single option string at the end

  my ($help,$version,$quiet);
  my ($mates1,$mates2,$fastq,$fasta,$sequence_format,$skip,$qupto);
  my ($phred33,$phred64,$solexa);
  my ($path_to_bowtie,$bowtie2,$mismatches,$seed_length,$best,$chunk,$ceiling,$minins,$maxins);
  my ($seed_extension_fails,$reseed_repetitive_seeds,$most_valid_alignments,$score_min,$parallel,$rdg,$rfg);
  my ($non_directional,$pbat);
  my ($unmapped,$multi_map,$output_dir,$temp_dir,$vanilla,$sam_no_hd,$non_bs_mm,$samtools_path,$bam,$gzip,$prefix,$old_flag);

  # GetOptions returns false if any of the supplied options could not be processed
  my $options_ok = GetOptions ('help|man'                => \$help,
                               '1=s'                     => \$mates1,
                               '2=s'                     => \$mates2,
                               'path_to_bowtie=s'        => \$path_to_bowtie,
                               'f|fasta'                 => \$fasta,
                               'q|fastq'                 => \$fastq,
                               's|skip=i'                => \$skip,
                               'u|upto=i'                => \$qupto,
                               'phred33-quals'           => \$phred33,
                               'phred64-quals|solexa1'   => \$phred64,
                               'solexa-quals'            => \$solexa,
                               'n|seedmms=i'             => \$mismatches,
                               'l|seedlen=i'             => \$seed_length,
                               'no_best'                 => \$best,
                               'version'                 => \$version,
                               'quiet'                   => \$quiet,
                               'chunkmbs=i'              => \$chunk,
                               'non_directional'         => \$non_directional,
                               'I|minins=i'              => \$minins,
                               'X|maxins=i'              => \$maxins,
                               'e|maqerr=i'              => \$ceiling,
                               'un|unmapped'             => \$unmapped,
                               'ambiguous'               => \$multi_map,
                               'o|output_dir=s'          => \$output_dir,
                               'bowtie2'                 => \$bowtie2,
                               'vanilla'                 => \$vanilla,
                               'sam-no-hd'               => \$sam_no_hd,
                               'D=i'                     => \$seed_extension_fails,
                               'R=i'                     => \$reseed_repetitive_seeds,
                               'score_min=s'             => \$score_min,
                               'most_valid_alignments=i' => \$most_valid_alignments,
                               'p=i'                     => \$parallel,
                               'temp_dir=s'              => \$temp_dir,
                               'rdg=s'                   => \$rdg,
                               'rfg=s'                   => \$rfg,
                               'non_bs_mm'               => \$non_bs_mm,
                               'samtools_path=s'         => \$samtools_path,
                               'bam'                     => \$bam,
                               'gzip'                    => \$gzip,
                               'pbat'                    => \$pbat,
                               'prefix=s'                => \$prefix,
                               'old_flag'                => \$old_flag,
                              );

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($options_ok){
    die "Please respecify command line options\n";
  }

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }

  ### VERSION
  if ($version){
    print << "VERSION";


Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Version: $bismark_version
Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/


VERSION
    exit;
  }


  ##########################
  ### PROCESSING OPTIONS ###
  ##########################

  # both flags are returned to the caller, so they are normalised to 0 if unset
  $bowtie2   = 0 unless ($bowtie2);
  $sam_no_hd = 0 unless ($sam_no_hd);

  ### PATH TO BOWTIE
  ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
  if ($path_to_bowtie){
    $path_to_bowtie .= '/' unless ($path_to_bowtie =~ /\/$/);
    unless (-d $path_to_bowtie){
      die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
    }
    $path_to_bowtie .= $bowtie2 ? 'bowtie2' : 'bowtie';
  }
  elsif ($bowtie2){
    $path_to_bowtie = 'bowtie2';
    warn "Path to Bowtie 2 specified as: $path_to_bowtie\n";
  }
  else{
    $path_to_bowtie = 'bowtie';
    warn "Path to Bowtie specified as: $path_to_bowtie\n";
  }

  ### OUTPUT REQUESTED AS BAM FILE
  if ($bam){
    if ($vanilla){
      die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
    }

    ### PATH TO SAMTOOLS
    if (defined $samtools_path){
      # if only a folder was given (i.e. not the full command) the executable name is appended
      unless ($samtools_path =~ /samtools$/){
        $samtools_path .= '/' unless ($samtools_path =~ /\/$/);
        $samtools_path .= 'samtools';
      }
      unless (-e $samtools_path){
        die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
      }

      warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
      $bam = 1;
    }
    # Check whether Samtools is in the PATH if no path was supplied by the user
    elsif (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
      $samtools_path = `which samtools`;
      chomp $samtools_path;
      warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
      $bam = 1;
    }

    # no Samtools available at all: $bam = 2 signals GZIP compression to the caller
    unless (defined $samtools_path){
      $bam = 2;
      warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
    }
    sleep (1);
  }


  ####################################
  ### PROCESSING ARGUMENTS

  ### GENOME FOLDER
  my $genome_folder = shift @ARGV; # mandatory
  unless ($genome_folder){
    warn "Genome folder was not specified!\n";
    print_helpfile();
    exit;
  }

  ### checking that the genome folder, all subfolders and the required bowtie index files exist
  $genome_folder .= '/' unless ($genome_folder =~ /\/$/);

  if (chdir $genome_folder){
    my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
    $absolute_genome_folder .= '/' unless ($absolute_genome_folder =~ /\/$/);
    warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder')\n";
    $genome_folder = $absolute_genome_folder;
  }
  else{
    die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
  }

  my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
  my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";

  if ($bowtie2){ ### Bowtie 2 (new)
    ### checking the integrity of $CT_dir
    chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
    my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
    foreach my $file (@CT_bowtie_index){
      unless (-f $file){
        die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
      }
    }
    ### checking the integrity of $GA_dir
    chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
    my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
    foreach my $file (@GA_bowtie_index){
      unless (-f $file){
        die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
      }
    }
  }
  else{ ### Bowtie 1 (default)
    ### checking the integrity of $CT_dir
    chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
    my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
    foreach my $file (@CT_bowtie_index){
      unless (-f $file){
        die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
      }
    }
    ### checking the integrity of $GA_dir
    chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
    my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
    foreach my $file (@GA_bowtie_index){
      unless (-f $file){
        die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
      }
    }
  }

  my $CT_index_basename = "${CT_dir}BS_CT";
  my $GA_index_basename = "${GA_dir}BS_GA";

  ### INPUT OPTIONS

  ### SEQUENCE FILE FORMAT
  ### exits if both fastA and FastQ were specified
  if ($fasta and $fastq){
    die "Only one sequence filetype can be specified (fastA or fastQ)\n";
  }

  ### unless fastA is specified explicitely, fastQ sequence format is expected by default
  if ($fasta){
    print "FastA format specified\n";
    $sequence_format = 'FASTA';
    push @bowtie_options, '-f';
  }
  elsif ($fastq){
    print "FastQ format specified\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }
  else{
    $fastq = 1; # the quality checks below test $fastq, so the default has to be recorded here
    print "FastQ format assumed (by default)\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }

  ### SKIP (only announced here; the value is returned to the caller and not added to the Bowtie options)
  if ($skip){
    warn "Skipping the first $skip reads from the input file\n";
  }

  ### UPTO (only announced here; the value is returned to the caller and not added to the Bowtie options)
  if ($qupto){
    warn "Processing sequences up to read no. $qupto from the input file\n";
  }

  ### QUALITY VALUES
  if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
    die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)\n";
  }
  if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? '--phred33' : '--phred33-quals');
  }
  if ($phred64){
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values work only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? '--phred64' : '--phred64-quals');
  }
  else{
    $phred64 = 0;
  }

  if ($solexa){
    if ($bowtie2){
      die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
    }
    # Solexa to Phred value conversion works only when -q is specified
    unless ($fastq){
      die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options,"--solexa-quals";
  }
  else{
    $solexa = 0;
  }

  ### ALIGNMENT OPTIONS

  ### MISMATCHES
  if (defined $mismatches){
    if ($bowtie2){
      if ($mismatches == 0 or $mismatches == 1){
        push @bowtie_options,"-N $mismatches";
      }
      else{
        die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
      }
    }
    else{
      if ($mismatches >= 0 and $mismatches <= 3){
        push @bowtie_options,"-n $mismatches";
      }
      else{
        die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
      }
    }
  }
  else{
    unless ($bowtie2){
      push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
    }
  }

  ### SEED LENGTH
  if (defined $seed_length){
    push @bowtie_options, ($bowtie2 ? "-L $seed_length" : "-l $seed_length");
  }

  ### MISMATCH CEILING
  if (defined $ceiling){
    die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
    push @bowtie_options,"-e $ceiling";
  }


  ### BOWTIE 2 EFFORT OPTIONS

  ### CONSECUTIVE SEED EXTENSION FAILS
  if (defined $seed_extension_fails){
    die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-D $seed_extension_fails";
  }

  ### RE-SEEDING REPETITIVE SEEDS
  if (defined $reseed_repetitive_seeds){
    die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-R $reseed_repetitive_seeds";
  }


  ### BOWTIE 2 SCORING OPTIONS
  if ($score_min){
    die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($score_min =~ /^L,.+,.+$/){
      die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--score-min $score_min";
  }
  elsif ($bowtie2){
    push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
  }

  ### BOWTIE 2 READ GAP OPTIONS
  # the gap penalties are returned to the caller; 5 (open) and 3 (extend) are used if nothing was specified
  my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);

  if ($rdg){
    die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    if ($rdg =~ /^(\d+),(\d+)$/){
      $deletion_open   = $1;
      $deletion_extend = $2;
    }
    else{
      die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--rdg $rdg";
  }
  else{
    $deletion_open   = 5;
    $deletion_extend = 3;
  }

  ### BOWTIE 2 REFERENCE GAP OPTIONS
  if ($rfg){
    die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    if ($rfg =~ /^(\d+),(\d+)$/){
      $insertion_open   = $1;
      $insertion_extend = $2;
    }
    else{
      die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--rfg $rfg";
  }
  else{
    $insertion_open   = 5;
    $insertion_extend = 3;
  }


  ### BOWTIE 2 PARALLELIZATION OPTIONS
  if (defined $parallel){
    die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!\n" unless ($bowtie2);
  }
  if ($bowtie2 and $parallel){
    die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
    push @bowtie_options,"-p $parallel";
    push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
    print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
    sleep (2);
  }

  ### REPORTING OPTIONS

  if ($bowtie2){
    push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default

    ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7, so the value is no longer passed on to Bowtie 2
    if (defined $most_valid_alignments){
      warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
    }
  }
  else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
    push @bowtie_options,'-k 2';
  }

  ### --BEST
  if ($bowtie2){
    if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
      die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
    }
  }
  else{
    # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
    push @bowtie_options,'--best' unless ($best);
  }

  ### VANILLA BISMARK (BOWTIE 1) OUTPUT
  if ($vanilla){
    if ($bowtie2){
      die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
    }
  }
  else{
    $vanilla = 0;
  }

  ### PAIRED-END MAPPING
  if ($mates1){
    my @mates1 = (split (/,/,$mates1));
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
    my @mates2 = (split(/,/,$mates2));
    unless (scalar @mates1 == scalar @mates2){
      die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
    }

    # Pairing the files by index; testing the length (rather than the truth) of a name makes sure that
    # a file called '0' is accepted and that an empty entry does not silently discard all following pairs
    foreach my $index (0..$#mates1){
      my ($mate1,$mate2) = ($mates1[$index],$mates2[$index]);
      unless (length $mate1 and length $mate2){
        die "Paired-end mapping requires non-empty file names for both mates, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
      }
      push @filenames,"$mate1,$mate2";
    }

    if ($bowtie2){
      push @bowtie_options,'--no-mixed';     ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
      push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
    }

    if ($old_flag){
      warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
      sleep(3);
    }
  }
  elsif ($mates2){
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
  }

  ### SINGLE-END MAPPING
  # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
  my $singles;
  unless ($mates1 and $mates2){
    $singles = join (',',@ARGV); # everything still left in @ARGV is treated as a sequence file
    unless ($singles){
      die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
    }
    $singles =~ s/\s/,/g;
    @filenames = (split(/,/,$singles));
    warn "\nFiles to be analysed:\n";
    warn "@filenames\n\n";
    sleep (3);
  }

  ### MININUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $minins){
    die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--minins $minins";
  }

  ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $maxins){
    die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--maxins $maxins";
  }
  else{
    push @bowtie_options,'--maxins 500' unless ($singles);
  }

  ### QUIET prints nothing besides alignments (suppresses warnings)
  if ($quiet){
    push @bowtie_options,'--quiet';
  }

  ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
  unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
    if (defined $chunk){
      push @bowtie_options,"--chunkmbs $chunk";
    }
    else{
      push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
    }
  }


  ### SUMMARY OF ALL BOWTIE OPTIONS
  my $bowtie_options = join (' ',@bowtie_options);


  ### STRAND-SPECIFIC LIBRARIES
  my $directional;
  if ($non_directional){
    die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
    warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
    sleep (3);
    $directional = 0;
  }
  elsif ($pbat){
    die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
    die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
    die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);

    warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
    sleep (3);
    $directional = 0;
  }
  else{
    warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
    sleep (3);
    $directional = 1; # default behaviour
  }

  ### UNMAPPED SEQUENCE OUTPUT
  $unmapped = 0 unless ($unmapped);

  ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
  $multi_map = 0 unless ($multi_map);


  ### OUTPUT DIRECTORY

  # relative paths are resolved against the directory Bismark was started from
  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($output_dir){
    $output_dir .= '/' unless ($output_dir =~ /\/$/);

    # the folder is created if we can't move into it
    unless (chdir $output_dir){
      mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
      warn "Created output directory $output_dir!\n\n";
      chdir $output_dir or die "Failed to move to $output_dir\n";
    }
    $output_dir = getcwd; # making the path absolute
    $output_dir .= '/' unless ($output_dir =~ /\/$/);
    warn "Output will be written into the directory: $output_dir\n";
  }
  else{
    $output_dir = '';
  }

  ### TEMPORARY DIRECTORY for C->T and G->A transcribed files

  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($temp_dir){
    warn "\nUsing temp directory: $temp_dir\n";
    $temp_dir .= '/' unless ($temp_dir =~ /\/$/);

    # the folder is created if we can't move into it
    unless (chdir $temp_dir){
      mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
      warn "Created temporary directory $temp_dir!\n\n";
      chdir $temp_dir or die "Failed to move to $temp_dir\n";
    }
    $temp_dir = getcwd; # making the path absolute
    $temp_dir .= '/' unless ($temp_dir =~ /\/$/);
    warn "Temporary files will be written into the directory: $temp_dir\n";
  }
  else{
    $temp_dir = '';
  }

  ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
  if ($non_bs_mm and $vanilla){
    die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
  }

  ### PREFIX FOR OUTPUT FILES
  if ($prefix){
    # removing trailing dots
    $prefix =~ s/\.+$//;

    warn "Using the following prefix for output files: $prefix\n\n";
    sleep(1);
  }

  # the order of this list has to match the list assignment at the call site
  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,
          $directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,
          $temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,
          $samtools_path,$pbat,$prefix,$old_flag);
}
|
|
6782
|
|
6783
|
|
6784
|
|
### Writes the SAM header to the filehandle OUT:
###   - one @HD line (format version 1.0, sort order 'unsorted')
###   - one @SQ line per entry of %chromosomes (sequence name and sequence length)
###   - one @PG line holding the Bismark version and the command line Bismark was started with
### Takes no arguments; reads the file-level variables %chromosomes, $bismark_version and $command_line.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n"; # @HD = header, VN = version, SO = sort order

  # The chromosome names are sorted so that the @SQ lines come out in the same order on every run
  # (the order returned by a plain 'keys' is not stable between runs)
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n"; # @SQ = sequence, SN = seq name, LN = length
  }

  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
|
|
6793
|
|
6794 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
|
|
6795 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
|
|
6796
|
|
6797 sub single_end_SAM_output{
|
|
6798 my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
|
|
6799 my $strand = $methylation_call_params->{$id}->{alignment_strand};
|
|
6800 my $chr = $methylation_call_params->{$id}->{chromosome};
|
|
6801 my $start = $methylation_call_params->{$id}->{position};
|
|
6802 my $stop = $methylation_call_params->{$id}->{end_position};
|
|
6803 my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
|
|
6804 my $methcall = $methylation_call_params->{$id}->{methylation_call};
|
|
6805 my $read_conversion = $methylation_call_params->{$id}->{read_conversion};
|
|
6806 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
|
|
6807 my $number_of_mismatches;
|
|
6808 if ($bowtie2){
|
|
6809 $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score};
|
|
6810 }
|
|
6811 else{
|
|
6812 $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches};
|
|
6813 }
|
|
6814
|
|
6815 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
|
|
6816 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
|
|
6817 ## Bit Description Comment Value
|
|
6818 ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1)
|
|
6819 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2)
|
|
6820 ## 0x4 segment unmapped --- ---
|
|
6821 ## 0x8 next segment in the template unmapped --- ---
|
|
6822 ## 0x10 SEQ being reverse complemented value: 2**4 ( 16)
|
|
6823 ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32)
|
|
6824 ## 0x40 the first segment in the template read 1 value: 2**6 ( 64)
|
|
6825 ## 0x80 the last segment in the template read 2 value: 2**7 (128)
|
|
6826 ## 0x100 secondary alignment --- ---
|
|
6827 ## 0x200 not passing quality controls --- ---
|
|
6828 ## 0x400 PCR or optical duplicate --- ---
|
|
6829
|
|
6830 #####
|
|
6831
|
|
6832 my $flag; # FLAG variable used for SAM format.
|
|
6833 if ($strand eq "+"){
|
|
6834 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
|
|
6835 $flag = 0; # 0 for "+" strand (OT)
|
|
6836 }
|
|
6837 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
|
|
6838 $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand)
|
|
6839 }
|
|
6840 else{
|
|
6841 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
|
|
6842 }
|
|
6843 }
|
|
6844 elsif ($strand eq "-"){
|
|
6845 if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
|
|
6846 $flag = 16; # 16 for "-" strand (OB)
|
|
6847 }
|
|
6848 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
|
|
6849 $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand)
|
|
6850 }
|
|
6851 else{
|
|
6852 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
|
|
6853 }
|
|
6854 }
|
|
6855 else{
|
|
6856 die "Unexpected strand information: $strand\n\n";
|
|
6857 }
|
|
6858
|
|
6859 #####
|
|
6860
|
|
6861 my $mapq = 255; # Assume mapping quality is unavailable
|
|
6862
|
|
6863 #####
|
|
6864
|
|
6865 my $cigar;
|
|
6866 if ($bowtie2){
|
|
6867 $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2
|
|
6868 }
|
|
6869 else{
|
|
6870 $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches)
|
|
6871 }
|
|
6872
|
|
6873 #####
|
|
6874
|
|
6875 my $rnext = "*"; # Paired-end variable
|
|
6876
|
|
6877 #####
|
|
6878
|
|
6879 my $pnext = 0; # Paired-end variable
|
|
6880
|
|
6881 #####
|
|
6882
|
|
6883 my $tlen = 0; # Paired-end variable
|
|
6884
|
|
6885 #####
|
|
6886
|
|
6887 if ($read_conversion eq 'CT'){
|
|
6888 $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
|
|
6889 }
|
|
6890 else{
|
|
6891 $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
|
|
6892 }
|
|
6893
|
|
6894 if ($strand eq '-'){
|
|
6895 $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
|
|
6896 $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence
|
|
6897 $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well
|
|
6898 }
|
|
6899
|
|
6900 #####
|
|
6901
|
|
6902 my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string
|
|
6903 # into the reference string. hemming_dist()
|
|
6904 if ($bowtie2){
|
|
6905 $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
|
|
6906 }
|
|
6907
|
|
6908 my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences
|
|
6909
|
|
6910 #####
|
|
6911
|
|
6912 my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)
|
|
6913
|
|
6914 #####
|
|
6915
|
|
6916 my $XM_tag; # Optional tag XM: Methylation Call String
|
|
6917 if ($strand eq '+'){
|
|
6918 $XM_tag = "XM:Z:$methcall";
|
|
6919 }
|
|
6920 elsif ($strand eq '-'){
|
|
6921 $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
|
|
6922 }
|
|
6923
|
|
6924 #####
|
|
6925
|
|
6926 my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion
|
|
6927
|
|
6928 #####
|
|
6929
|
|
6930 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion
|
|
6931
|
|
6932 #####
|
|
6933
|
|
6934 # Optionally calculating number of mismatches for Bowtie 2 alignments
|
|
6935
|
|
6936 if ($non_bs_mm) {
|
|
6937 if ($bowtie2) {
|
|
6938
|
|
6939 $number_of_mismatches =~ s/-//; # removing the minus sign
|
|
6940
|
|
6941 ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches
|
|
6942 if ($cigar =~ /(D|I)/) {
|
|
6943 # warn "$cigar\n";
|
|
6944
|
|
6945 # parsing CIGAR string
|
|
6946 my @len = split (/\D+/,$cigar); # storing the length per operation
|
|
6947 my @ops = split (/\d+/,$cigar); # storing the operation
|
|
6948 shift @ops; # remove the empty first element
|
|
6949 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
|
|
6950
|
|
6951 foreach (0..$#len) {
|
|
6952 if ($ops[$_] eq 'M') {
|
|
6953 # warn "skipping\n";
|
|
6954 next; # irrelevant
|
|
6955 }
|
|
6956 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
|
|
6957 $number_of_mismatches -= $insertion_open;
|
|
6958 $number_of_mismatches -= $len[$_] * $insertion_extend;
|
|
6959 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
|
|
6960 }
|
|
6961 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
|
|
6962 $number_of_mismatches -= $deletion_open;
|
|
6963 $number_of_mismatches -= $len[$_] * $deletion_extend;
|
|
6964 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
|
|
6965 }
|
|
6966 elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
|
|
6967 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
|
|
6968 }
|
|
6969 else {
|
|
6970 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
|
|
6971 }
|
|
6972 }
|
|
6973 # warn "Alignment score $number_of_mismatches\n";
|
|
6974 # print "Mismatches $number_of_mismatches\n\n";
|
|
6975 }
|
|
6976 ### Now we have InDel corrected alignment scores
|
|
6977
|
|
6978 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
|
|
6979 ### sequence contained more than 5 Ns, but this should occur close to never
|
|
6980
|
|
6981 my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division
|
|
6982 # warn "N count: $seq_N_count\n";
|
|
6983 $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
|
|
6984 # warn "MM $number_of_mismatches\n";
|
|
6985 }
|
|
6986 }
|
|
6987
|
|
6988 ####
|
|
6989
|
|
6990 my $XA_tag = "XA:Z:$number_of_mismatches";
|
|
6991
|
|
6992 #####
|
|
6993
|
|
6994 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
|
|
6995 ### optionally print number of non-bisulfite mismatches
|
|
6996 if ($non_bs_mm){
|
|
6997 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n";
|
|
6998 }
|
|
6999 else{ # default
|
|
7000 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
|
|
7001 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
|
|
7002 }
|
|
7003 }
|
|
7004
|
|
### Writes the SAM records of one paired-end alignment to the OUT filehandle: one line for read 1 followed by one
### line for read 2.
###
### Arguments:
###   $id                       read ID; also the key into $methylation_call_params
###   $actual_seq_1/_2          sequences of read 1 and read 2
###   $methylation_call_params  hash reference; $methylation_call_params->{$id} supplies the alignment strands, the
###                             chromosome, the genomic sequences, the methylation calls, the read/genome conversions
###                             and the positions, CIGAR strings and scores read below
###   $qual_1/_2                quality strings of read 1 and read 2
###
### The file-level settings $old_flag, $bowtie2 and $non_bs_mm select the read ID suffixes and FLAG values, where the
### positions/CIGAR strings/scores are taken from, and whether the optional XA/XB fields are written.
###
### Dies if a genomic sequence contains characters other than ACTGNRYMKSWBDHV (case-insensitive), if the combination of
### read 1 conversion and genome conversion is not one of the four handled below, or (in the helper that follows this
### subroutine) if a CIGAR string cannot be interpreted.
sub paired_end_SAM_output{
  my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
  my $strand_1          = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
  my $strand_2          = $methylation_call_params->{$id}->{alignment_read_2};
  my $chr               = $methylation_call_params->{$id}->{chromosome};
  my $ref_seq_1         = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
  my $ref_seq_2         = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
  my $methcall_1        = $methylation_call_params->{$id}->{methylation_call_1};
  my $methcall_2        = $methylation_call_params->{$id}->{methylation_call_2};
  my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
  my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
  my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};

  ##### QNAME

  my $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard, so this only happens if $old_flag is set
  my $id_2 = $id;
  if ($old_flag){
    $id_1 .= '/1';
    $id_2 .= '/2';
  }

  # Allows all degenerate nucleotide sequences in reference genome
  die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
  die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;

  my $index; # used to store the strand origin of the alignment in a less convoluted way

  if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
    $index = 0; ## this is OT   (original top strand)
  }
  elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
    $index = 1; ## this is CTOB (complementary to OB)
  }
  elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
    $index = 2; ## this is CTOT (complementary to OT)
  }
  elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
    $index = 3; ## this is OB   (original bottom)
  }
  else {
    die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
  }

  ##### Alignment scores (Bowtie 2) or numbers of mismatches (Bowtie 1); only needed for custom allele-specific output, not the default!

  my $number_of_mismatches_1;
  my $number_of_mismatches_2;

  if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
    $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1};
    $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2};
  }
  elsif ($index == 2 or $index == 3){ # CTOT or OB. Bowtie 1 reports always the leftmost read first, so the values need to be swapped if the first read aligned in reverse orientation
    $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2};
    $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1};
  }
  else{ # if the first read aligned in forward direction it is like for Bowtie 2
    $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1};
    $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
  }

  ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
  ### first or last position.

  if ($index == 0 or $index == 3){ # OT or OB
    $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
    $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
  }
  else{ # CTOT or CTOB
    $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
    $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
  }

  ##### POS: start and end positions of both reads

  my ($start_read_1,$start_read_2);
  my ($end_read_1,$end_read_2);

  if ($bowtie2){
    $start_read_1 = $methylation_call_params->{$id}->{position_1};
    $start_read_2 = $methylation_call_params->{$id}->{position_2};
    $end_read_1   = $methylation_call_params->{$id}->{end_position_1};
    $end_read_2   = $methylation_call_params->{$id}->{end_position_2};
  }
  elsif ($strand_1 eq '+'){ # Bowtie 1 output, Read 1 aligns to the + strand: read 1 starts the alignment, read 2 ends it
    $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
    $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
    $end_read_1   = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
    $end_read_2   = $methylation_call_params->{$id}->{alignment_end};
  }
  else{ # Bowtie 1 output, read 1 is on the - strand: read 2 starts the alignment, read 1 ends it
    $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
    $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
    $end_read_1   = $methylation_call_params->{$id}->{alignment_end};
    $end_read_2   = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
  }

  ##### FLAG

  ### Bits of the bitwise SAM FLAG field that are combined below (taken from "The SAM Format Specification (v1.4-r985), September 7, 2011"):
  ##  0x1    (1)    template having multiple segments in sequencing (paired-end)
  ##  0x2    (2)    each segment properly aligned according to the aligner
  ##  0x10   (16)   SEQ being reverse complemented
  ##  0x20   (32)   SEQ of the next segment in the template being reversed
  ##  0x40   (64)   the first segment in the template (read 1)
  ##  0x80   (128)  the last segment in the template (read 2)

  ### As the FLAG value does not consider that there might be 4 different bisulfite strands of DNA, the FLAG values take the strand identity into account:
  # strands OT and CTOT are treated as aligning to the top strand
  # strands OB and CTOB are treated as aligning to the bottom strand
  ### The new default FLAG values have been suggested by Peter Hickey, Australia (PH)

  my $flag_1;
  my $flag_2;

  if ($index == 0 or $index == 2){ # OT or CTOT
    if ($old_flag){
      $flag_1 = 67;   # 1+2+64     (paired, proper pair, read 1)
      $flag_2 = 131;  # 1+2+128    (paired, proper pair, read 2)
    }
    else{
      $flag_1 = 99;   # PH: 1+2+32+64   (paired, proper pair, mate reversed, read 1)
      $flag_2 = 147;  # PH: 1+2+16+128  (paired, proper pair, reversed, read 2)
    }
  }
  else{ # $index is 1 or 3 here: CTOB or OB
    if ($old_flag){
      $flag_1 = 115;  # 1+2+16+32+64   (paired, proper pair, reversed, mate reversed, read 1)
      $flag_2 = 179;  # 1+2+16+32+128  (paired, proper pair, reversed, mate reversed, read 2)
    }
    else{
      $flag_1 = 83;   # PH: 1+2+16+64   (paired, proper pair, reversed, read 1)
      $flag_2 = 163;  # PH: 1+2+32+128  (paired, proper pair, mate reversed, read 2)
    }
  }

  ##### MAPQ

  my $mapq = 255; # Mapping quality is unavailable

  ##### CIGAR

  my $cigar_1;
  my $cigar_2;

  if ($bowtie2){
    $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
    $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
  }
  else{
    $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
    $cigar_2 = length($actual_seq_2) . "M";
  }

  ##### RNEXT and PNEXT

  my $rnext = '=';             # Chromosome of mate; applies to both reads
  my $pnext_1 = $start_read_2; # Leftmost position of mate
  my $pnext_2 = $start_read_1;

  ##### TLEN: signed observed Template LENgth (or inferred fragment size)

  my $tlen_1;
  my $tlen_2;

  if ($bowtie2){ # dovetailing of reads is not enabled for Bowtie 2 alignments

    if ($start_read_1 <= $start_read_2){ # Read 1 alignment is leftmost (or both reads start at the same position)

      if ($end_read_2 >= $end_read_1){
        # ------------------------->           read 1     reads overlapping, or starting/ending at the same positions
        #          <-------------------------  read 2
        $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
        $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
      }
      elsif ($end_read_2 < $end_read_1){
        # ------------------------->           read 1
        #      <-----------                    read 2     read 2 ends before read 1 does
        # using the length of read 1 for the TLEN variable. Well this is debatable. Changed this
        # as a request by frozenlyse on SeqAnswers on 24 July 2013
        $tlen_1 = $end_read_1 - $start_read_1 + 1;        # Set to length of read 1, + sign
        $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1, - sign
      }
    }
    elsif ($start_read_2 < $start_read_1){ # Read 2 alignment is leftmost

      if ($end_read_1 >= $end_read_2){
        # ------------------------->           read 2     reads overlapping, or ending at the same position
        #          <-------------------------  read 1
        $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
        $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
      }
      elsif ($end_read_1 < $end_read_2){
        # ------------------------->           read 2
        #      <-----------                    read 1     read 1 contained within read 2
        # using the length of read 2 for the TLEN variable. Well this is debatable. Changed this
        # as a request by frozenlyse on SeqAnswers on 24 July 2013
        $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2, - sign
        $tlen_2 = $end_read_2 - $start_read_2 + 1;        # Set to length of read 2, + sign
      }
    }
  }
  else{ # Bowtie 1

    if ($end_read_2 >= $end_read_1){
      # Read 1 alignment is leftmost
      # ------------------------->         read 1
      # <-------------------------         read 2
      # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
      $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
      $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
    }
    else{
      # Read 2 alignment is leftmost
      # ------------------------->         read 2
      # <-------------------------         read 1
      # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
      $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
      $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
    }
  }

  ##### SEQ and QUAL

  # adjusting the strand of the sequences before we use them to generate mismatch strings
  if ($strand_1 eq '-'){
    $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
    $ref_seq_1    = revcomp($ref_seq_1);    # Required for comparison with actual sequence
    $qual_1       = reverse $qual_1;        # we need to reverse the quality string as well
  }
  if ($strand_2 eq '-'){
    $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
    $ref_seq_2    = revcomp($ref_seq_2);    # Required for comparison with actual sequence
    $qual_2       = reverse $qual_2;        # If the sequence gets reverse complemented we reverse the quality string as well
  }

  ##### Optional tag NM: edit distance based on nucleotide differences

  my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1);
  my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
  if ($bowtie2){
    $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
    $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2};
  }
  my $NM_tag_1 = "NM:i:$hemming_dist_1";
  my $NM_tag_2 = "NM:i:$hemming_dist_2";

  ##### Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)

  my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1);
  my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);

  ##### Optional tag XM: Methylation call string; needs to be reversed if the sequence was reverse complemented

  my $XM_tag_1 = 'XM:Z:' . ($strand_1 eq '-' ? scalar reverse($methcall_1) : $methcall_1);
  my $XM_tag_2 = 'XM:Z:' . ($strand_2 eq '-' ? scalar reverse($methcall_2) : $methcall_2);

  ##### Optional tags XR (conversion state of each read) and XG (genome conversion state; valid for both reads)

  my $XR_tag_1 = "XR:Z:$read_conversion_1";
  my $XR_tag_2 = "XR:Z:$read_conversion_2";
  my $XG_tag   = "XG:Z:$genome_conversion";

  ##### Optionally calculating the number of mismatches from the alignment scores of Bowtie 2 alignments

  if ($non_bs_mm and $bowtie2){
    $number_of_mismatches_1 = _pe_non_bs_mismatch_count($number_of_mismatches_1,$cigar_1);
    $number_of_mismatches_2 = _pe_non_bs_mismatch_count($number_of_mismatches_2,$cigar_2);
  }

  ##### Output

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @record_1 = ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1, $XR_tag_1, $XG_tag);
  my @record_2 = ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2, $XR_tag_2, $XG_tag);

  ### optionally print number of non-bisulfite mismatches (XA for read 1, XB for read 2)
  if ($non_bs_mm){
    push @record_1, "XA:Z:$number_of_mismatches_1";
    push @record_2, "XB:Z:$number_of_mismatches_2";
  }

  print OUT join("\t", @record_1), "\n";
  print OUT join("\t", @record_2), "\n";
}

### Private helper of paired_end_SAM_output: turns the Bowtie 2 alignment score of one read into a number of mismatches.
###
### Arguments: the alignment score and the CIGAR string of the read. Returns the number of mismatches.
### Uses the file-level penalties $insertion_open, $insertion_extend, $deletion_open and $deletion_extend.
### Dies if the CIGAR string has differing numbers of lengths and operations, or contains operations other than M, I and D.
sub _pe_non_bs_mismatch_count{
  my ($score,$cigar) = @_;

  $score =~ s/-//; # removing the minus sign

  ### we need to analyse the CIGAR string whether the read contained any indels, and take their penalties out of the score
  if ($cigar =~ /(D|I)/) {
    my @len = split (/\D+/,$cigar); # storing the length per operation
    my @ops = split (/\d+/,$cigar); # storing the operation
    shift @ops;                     # remove the empty first element
    die "CIGAR string '$cigar' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

    foreach my $i (0..$#len) {
      if ($ops[$i] eq 'M') {
        next; # irrelevant
      }
      elsif ($ops[$i] eq 'I') { # insertion in the read sequence
        $score -= $insertion_open;
        $score -= $len[$i] * $insertion_extend;
      }
      elsif ($ops[$i] eq 'D') { # deletion in the read sequence
        $score -= $deletion_open;
        $score -= $len[$i] * $deletion_extend;
      }
      elsif ($cigar =~ tr/NSHPX=//) { # if these (for standard mapping) illegal characters exist we die. tr/// takes a plain character list, so no [] here
        die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
      else {
        die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
    }
  }

  ### Now we have an InDel corrected alignment score

  ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
  ### sequence contained more than 5 Ns, but this should occur close to never
  my $N_count = $score % 6; # modulo 6 will return the integer rest after the division
  return int ($score / 6) + $N_count;
}
|
|
7509
|
|
### Returns the reverse complement of the sequence passed in: the string is reversed and A/C/T/G (upper or lower case)
### are replaced by the upper case complementary base. All other characters are kept as they are.
### Dies if no sequence (or a false value such as an empty string) was passed in.
sub revcomp{
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  my $reverse_complement = scalar reverse($sequence);
  $reverse_complement =~ tr/ACTGactg/TGACTGAC/;
  return $reverse_complement;
}
|
|
7516
|
|
### Returns the number of positions of the read sequence (first argument) at which it differs from the reference
### sequence (second argument), i.e. the Hamming distance over the length of the read.
###
### Read positions beyond the end of a shorter reference count as differences; reference positions beyond the end
### of the read are ignored. The comparison is case-sensitive.
sub hemming_dist{
  my ($actual_seq,$ref_seq) = @_;

  my $read_length = length($actual_seq);
  my $overlap     = length($ref_seq) < $read_length ? length($ref_seq) : $read_length;

  # A bitwise XOR of two strings yields a \0 byte wherever both strings carry the same character, so counting the
  # non-\0 bytes counts the mismatches without having to split both sequences into arrays
  my $difference = substr($actual_seq,0,$overlap) ^ substr($ref_seq,0,$overlap);
  my $mismatches = ($difference =~ tr/\0//c);

  my $unpaired_read_bases = $read_length - $overlap; # read bases without a reference base to compare them to
  return $mismatches + $unpaired_read_bases;
}
|
|
7526
|
|
### Builds the optional XX tag ("XX:Z:...") for a read: runs of matching positions are written as their length and
### every mismatching position as the reference base at that position, e.g. read ACGT vs. reference ACGA gives XX:Z:3A.
### The two sequences are compared position by position, so no indel information is contained in the string.
###
### Arguments: the read sequence and the reference sequence. Dies if one of them is missing (or false).
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq    = shift or die "Missing reference sequence";

  my $XX_tag = "XX:Z:";
  my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison: \0 wherever both sequences carry the same character
  my $ref_length  = length($ref_seq);
  my $prev_mm_pos = 0;

  while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference
    my $mm_pos    = pos($tmp);                  # 1-based position of this mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1; # Number of nucleotides that matched since the last mismatch

    # Reference nucleotide that was different from the actual read. It stays undefined for read positions beyond the
    # end of the reference. The declaration is kept separate from the conditional assignment because the behaviour
    # of 'my $var = ... if ...;' is undefined in Perl
    my $nuc_mm;
    $nuc_mm = substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $XX_tag .= "$nuc_match" if $nuc_match > 0;  # Ignore if mismatches are adjacent to each other
    $XX_tag .= "$nuc_mm" if defined $nuc_mm;    # Ignore if there is no reference base (prevents uninitialized string concatenation)
    $prev_mm_pos = $mm_pos;                     # Position of last mismatch
  }

  my $end_matches = $ref_length - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence
  $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
|
|
7544
|
|
7545
|
|
7546
|
|
### print_helpfile
###
### Prints the complete Bismark help text (licence notice, description, usage, the
### Bowtie 1 / Bowtie 2 option reference and the output format descriptions) to the
### currently selected output filehandle (STDOUT unless the caller changed it).
###
### Takes no arguments. The return value is that of print() and is not meaningful.
###
### The heredoc is deliberately single-quoted: the help text is purely literal, so
### characters such as '@' (e.g. "starting with @") or '$' can never be interpolated
### by accident when the text is edited in the future.
sub print_helpfile{
  print <<'HOW_TO';


     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     GNU General Public License for more details.
     You should have received a copy of the GNU General Public License
     along with this program. If not, see <http://www.gnu.org/licenses/>.



DESCRIPTION


The following is a brief description of command line options and arguments to control the Bismark
bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
Each of these reads is then aligned to bisulfite treated forward strand index of a reference genome
(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
sequence from the genome and determine if there were any protected C's present or not.

As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
re-enabled by using --non_directional.

The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.


USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}


ARGUMENTS:

<genome_folder>          The path to the folder containing the unmodified reference genome
                         as well as the subfolders created by the Bismark_Genome_Preparation
                         script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
                         Bismark expects one or more fastA files in this folder (file extension: .fa
                         or .fasta). The path can be relative or absolute.

-1 <mates1>              Comma-separated list of files containing the #1 mates (filename usually includes
                         "_1", e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
                         correspond file-for-file and read-for-read with those specified in <mates2>.
                         Reads may be a mix of different lengths. Bismark will produce one mapping result
                         and one report file per paired-end input file pair.

-2 <mates2>              Comma-separated list of files containing the #2 mates (filename usually includes
                         "_2", e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
                         correspond file-for-file and read-for-read with those specified in <mates1>.
                         Reads may be a mix of different lengths.

<singles>                A comma- or space-separated list of files containing the reads to be aligned (e.g.
                         lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
                         produce one mapping result and one report file per input file.


OPTIONS:


Input:

-q/--fastq               The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
                         files (usually having extension .fq or .fastq). This is the default. See also
                         --solexa-quals.

-f/--fasta               The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
                         files (usually having extension .fa, .mfa, .fna or similar). All quality values
                         are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
                         the read name and the sequence on a single line (and not spread over several lines).

-s/--skip <int>          Skip (i.e. do not align) the first <int> reads or read pairs from the input.

-u/--upto <int>          Only aligns the first <int> reads or read pairs from the input. Default: no limit.

--phred33-quals          FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.

--phred64-quals          FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.

--solexa-quals           Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
                         (which can't). The formula for conversion is:
                         phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
                         is usually the right option for use with (unconverted) reads emitted by the GA
                         Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.

--solexa1.3-quals        Same as --phred64-quals. This is usually the right option for use with (unconverted)
                         reads emitted by GA Pipeline version 1.3 or later. Default: off.

--path_to_bowtie         The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
                         specified it is assumed that Bowtie (1 or 2) is in the PATH.


Alignment:

-n/--seedmms <int>       The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
                         of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
                         default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).

-l/--seedlen             The "seed length"; i.e., the number of bases of the high quality end of the read to
                         which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
                         larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).

-e/--maqerr <int>        Maximum permitted total of quality values at all mismatched read positions throughout
                         the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
                         quality values to the nearest 10 and saturates at 30. This value is not relevant for
                         Bowtie 2.

--chunkmbs <int>         The number of megabytes of memory a given thread is given to store path descriptors in
                         --best mode. Best-first search must keep track of many paths at once to ensure it is
                         always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
                         memory impact of the descriptors, but they can still grow very large in some cases. If
                         you receive an error message saying that chunk memory has been exhausted in --best mode,
                         try adjusting this parameter up to dedicate more memory to the descriptors. This value
                         is not relevant for Bowtie 2. Default: 512.

-I/--minins <int>        The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
                         a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
                         with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
                         satisfied). A 19-bp gap would not be valid in that case. Default: 0.

-X/--maxins <int>        The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
                         a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
                         60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
                         A 61-bp gap would not be valid in that case. Default: 500.


Bowtie 1 Reporting:

-k <2>                   Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
                         will be used by default.

--best                   Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
                         (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
                         terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
                         quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
                         have Phred quality 10. When --best is not specified, Bowtie may report alignments that
                         are sub-optimal in terms of stratum and/or quality (though an effort is made to report
                         the best alignment). --best mode also removes all strand bias. Note that --best does not
                         affect which alignments are considered "valid" by Bowtie, only which valid alignments
                         are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
                         Default: on.

--no_best                Disables the --best option which is on by default. This can speed up the alignment process,
                         e.g. for testing purposes, but for credible results it is not recommended to disable --best.


Output:

--non_directional        The sequencing library was constructed in a non strand-specific manner, alignments to all four
                         bisulfite strands will be reported. Default: OFF.

                         (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
                         to the original strands are merely theoretical and should not exist in reality. Specifying directional
                         alignments (which is the default) will only run 2 alignment threads to the original top (OT)
                         or bottom (OB) strands in parallel and report these alignments. This is the recommended option
                         for strand-specific libraries).

--pbat                   This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
                         PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
                         as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
                         and OB ones. Use this option only if you are certain that your libraries were constructed following
                         a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
                         --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
                         temporary files only).

--sam-no-hd              Suppress SAM header lines (starting with @). This might be useful when very large input files are
                         split up into several smaller files to run concurrently and the output files are to be merged.

--quiet                  Print nothing besides alignments.

--vanilla                Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
                         of SAM format output.

-un/--unmapped           Write all reads that could not be aligned to a file in the output directory. Written reads will
                         appear as they did in the input, without any translation of quality values that may have
                         taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
                         and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
                         with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
                         are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.

--ambiguous              Write all reads which produce more than one valid alignment with the same number of lowest
                         mismatches or other reads that fail to align uniquely to a file in the output directory.
                         Written reads will appear as they did in the input, without any of the translation of quality
                         values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
                         parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
                         _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.

-o/--output_dir <dir>    Write all output files into this directory. By default the output files will be written into
                         the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
                         to create it first. The path to the output folder can be either relative or absolute.

--temp_dir <dir>         Write temporary files to this directory instead of into the same directory as the input files. If
                         the specified folder does not exist, Bismark will attempt to create it first. The path to the
                         temporary folder can be either relative or absolute.

--non_bs_mm              Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read has during the
                         alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
                         just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
                         The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
                         and 'XB:Z:number of mismatches' for read 2 of paired-end reads.

--gzip                   Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
                         space. This option is available for most alignment modes but is not available for paired-end FastA
                         files. This option might be somewhat slower than writing out uncompressed files, but this awaits
                         further testing.

--bam                    The output will be written out in BAM format instead of the default SAM format. Bismark will
                         attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
                         been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
                         the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).

--samtools_path          The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
                         explicitly if Samtools is in the PATH already.

--prefix <prefix>        Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
                         example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.

--old_flag               Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
                         this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
                         both the appended read IDs and custom FLAG values may cause problems with some downstream tools
                         such as Picard, new defaults were implemented as of version 0.8.3.


                                             default                         old_flag
                                       ===================              ===================
                                       Read 1       Read 2              Read 1       Read 2

                              OT:         99          147                  67          131

                              OB:         83          163                 115          179

                              CTOT:       99          147                  67          131

                              CTOB:       83          163                 115          179



Other:

-h/--help                Displays this help file.

-v/--version             Displays version information.


BOWTIE 2 SPECIFIC OPTIONS

--bowtie2                Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
                         alignments, i.e. searches for alignments involving all read characters (also called
                         untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
                         and/or quality trimmed where appropriate. Default: off.

Bowtie 2 alignment options:

-N <int>                 Sets the number of mismatches to be allowed in a seed alignment during multiseed alignment.
                         Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
                         but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
                         Bowtie 1 see -n).

-L <int>                 Sets the length of the seed substrings to align during multiseed alignment. Smaller values
                         make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
                         used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
                         Bowtie 1 see -l).

--ignore-quals           When calculating a mismatch penalty, always consider the quality value at the mismatched
                         position to be the highest possible, regardless of the actual value. I.e. input is treated
                         as though all quality values are high. This is also the default behavior when the input
                         doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.


Bowtie 2 paired-end options:

--no-mixed               This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
                         it cannot find a concordant or discordant alignment for a pair. This option is invariable and
                         on by default.

--no-discordant          Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
                         A discordant alignment is an alignment where both mates align uniquely, but that does not
                         satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
                         and it is on by default.


Bowtie 2 effort options:

-D <int>                 Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
                         the alignments found so far. A seed extension "fails" if it does not yield a new best or a
                         new second-best alignment. Default: 15.

-R <int>                 <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
                         When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
                         mismatches allowed) at different offsets and searches for more alignments. A read is considered
                         to have repetitive seeds if the total number of seed hits divided by the number of seeds
                         that aligned at least once is greater than 300. Default: 2.

Bowtie 2 parallelization options:


-p NTHREADS              Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
                         and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
                         parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
                         E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
                         by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
                         library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
                         automatically use the option '--reorder', which guarantees that output SAM records are printed in
                         an order corresponding to the order of the reads in the original input file, even when -p is set
                         greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
                         setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
                         if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
                         correspond to input order in that case.

Bowtie 2 Scoring options:

--score_min <func>       Sets a function governing the minimum alignment score needed for an alignment to be considered
                         "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
                         L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
                         See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
                         L,0,-0.2.

--rdg <int1>,<int2>      Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
                         of <int1> + N * <int2>. Default: 5, 3.

--rfg <int1>,<int2>      Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
                         a penalty of <int1> + N * <int2>. Default: 5, 3.


Bowtie 2 Reporting options:

-most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
                         deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
                         default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
                         effort expended to find valid alignments.

                         For reference, this used to be the old (now deprecated) description of -M:
                         Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
                         can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
                         happens first. Only the best alignment is reported. Information from the other alignments is used to
                         estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
                         Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
                         aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
                         guarantee that the alignment reported is the best possible in terms of alignment score. -M is
                         always used and its default value is set to 10.


'VANILLA' Bismark OUTPUT:

Single-end output format (tab-separated):

 (1) <seq-ID>
 (2) <read alignment strand>
 (3) <chromosome>
 (4) <start position>
 (5) <end position>
 (6) <observed bisulfite sequence>
 (7) <equivalent genomic sequence>
 (8) <methylation call>
 (9) <read conversion>
(10) <genome conversion>
(11) <read quality score (Phred33)>


Paired-end output format (tab-separated):
 (1) <seq-ID>
 (2) <read 1 alignment strand>
 (3) <chromosome>
 (4) <start position>
 (5) <end position>
 (6) <observed bisulfite sequence 1>
 (7) <equivalent genomic sequence 1>
 (8) <methylation call 1>
 (9) <observed bisulfite sequence 2>
(10) <equivalent genomic sequence 2>
(11) <methylation call 2>
(12) <read 1 conversion>
(13) <genome conversion>
(14) <read 1 quality score (Phred33)>
(15) <read 2 quality score (Phred33)>


Bismark SAM OUTPUT (default):

 (1) QNAME  (seq-ID)
 (2) FLAG   (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
 (3) RNAME  (chromosome)
 (4) POS    (start position)
 (5) MAPQ   (always 255)
 (6) CIGAR
 (7) RNEXT
 (8) PNEXT
 (9) TLEN
(10) SEQ
(11) QUAL   (Phred33 scale)
(12) NM-tag (edit distance to the reference)
(13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
(14) XM-tag (methylation call string)
(15) XR-tag (read conversion state for the alignment)
(16) XG-tag (genome conversion state for the alignment)
(17) XA/XB-tag (non-bisulfite mismatches) (optional!)

Each read of paired-end alignments is written out in a separate line in the above format.


Last edited on 07 October 2013.

HOW_TO
}
|