comparison old/bismark @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use IO::Handle;
5 use Cwd;
6 $|++;
7 use Getopt::Long;
8
9
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
### Directory Bismark was started from; the main loop changes back to it before every input file
my $parent_dir      = getcwd;
my $bismark_version = 'v0.10.0';

### The command line is recorded here, i.e. before the option rewriting below touches @ARGV
my $command_line = join (" ",@ARGV);

### --solexa1.3-quals has to be renamed to --phred64-quals before option parsing, as the '.' in the
### option name will cause Getopt::Long to fail. The loop variable aliases the @ARGV elements, so the
### assignment changes @ARGV in place.
foreach my $option (@ARGV){
  $option = '--phred64-quals' if ($option eq '--solexa1.3-quals');
}

my @filenames; # will be populated by processing the command line

### All run-time settings, in exactly the order in which process_command_line() returns them
my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
    $prefix,
    $old_flag,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome; read in once and then kept for all further input files
my %counting;    # counting various events

### Reset to 0 at the start of each input file; reported on in the final analysis report if set
my $seqID_contains_tabs;
45
### Main loop: every entry of @filenames is bisulfite-converted, aligned and methylation-called in turn.
### An entry containing a comma is treated as a pair of mate files (paired-end), anything else as a
### single-end file. The alignment instances live in @fhs (slots 0-3); this loop only fills in their
### names and the temporary input file(s) each instance has to read.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    my @instance_names = ('CTread1GAread2CTgenome','GAread1CTread2GAgenome','GAread1CTread2CTgenome','CTread1GAread2GAgenome');
    foreach my $index (0..$#instance_names){
      $fhs[$index]->{name} = $instance_names[$index];
    }

    ### Hands the transformed read files to the four alignment instances: the two CTread1GAread2
    ### instances (slots 0 and 3) get C->T read 1 plus G->A read 2, the two GAread1CTread2 instances
    ### (slots 1 and 2) get G->A read 1 plus C->T read 2. A file that was not generated for the current
    ### mode is simply undef, which is stored as such so that both keys always exist in every slot.
    my $assign_paired_end_inputs = sub {
      my ($CT_read_1,$GA_read_1,$CT_read_2,$GA_read_2) = @_;
      foreach my $index (0,3){
        $fhs[$index]->{inputfile_1} = $CT_read_1;
        $fhs[$index]->{inputfile_2} = $GA_read_2;
      }
      foreach my $index (1,2){
        $fhs[$index]->{inputfile_1} = $GA_read_1;
        $fhs[$index]->{inputfile_2} = $CT_read_2;
      }
    };

    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){
        ### only the first returned file name is kept for each mate
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      ### in directional mode G->A read 1 and C->T read 2 are undef, which leaves slots 1 and 2 without input
      $assign_paired_end_inputs->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      ### Only Bowtie 1 runs use the gzip transformation, which is handed both mates at the same time.
      ### In that case no separate read 2 file names are returned, so those variables stay undef.
      my $bowtie1_gzip = ($gzip and not $bowtie2);

      if ($directional){
        if ($bowtie1_gzip){
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # this file contains both read 1 and read 2 in tab delimited format
        }
        else{
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }
        ### G->A read 1 and C->T read 2 are undef here, which leaves slots 1 and 2 without input
        $assign_paired_end_inputs->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      elsif ($pbat){ # PBAT-Seq
        ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        ### PBAT only feeds the GAread1CTread2 instances; slots 0 and 3 are deliberately left without input
        $assign_paired_end_inputs->(undef,$G_to_A_infile_1,$C_to_T_infile_2,undef);
      }
      else{
        if ($bowtie1_gzip){
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }
        $assign_paired_end_inputs->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }
    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input file is in FastA format\n";
      if ($directional){
        ### slots 2 and 3 are not given an input file in directional mode
        ($C_to_T_infile) = biTransformFastAFiles ($filename);
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;

      ### Creating 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
      }
    }

    ## FastQ format
    else{
      warn "Input file is in FastQ format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      elsif ($pbat){
        ($G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating up to 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      elsif ($pbat){
        single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
      }
    }

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);

  }
}
282
### Opens all output files for one single-end input file and then starts the methylation call.
###
### Arguments:
###   $sequence_file - the read file as provided on the command line (may contain a path)
###   $C_to_T_infile - C->T converted temporary read file (may be undef, depending on the mode)
###   $G_to_A_infile - G->A converted temporary read file (may be undef, depending on the mode)
###
### Side effects: opens the global filehandles OUT and REPORT and, if requested, UNMAPPED and AMBIG
### (all inside $output_dir), sets the global $bam to 0 if it was undefined, reads the genome into
### %chromosomes if that is still empty, and dies if any of the output files cannot be opened.
### Returns whatever the final processing call returns; nothing in this file section relies on it.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### all output names are derived from the input file name without its path
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### builds "[prefix.]filename<suffix>"; every output file of this sub is named this way
  my $output_name = sub {
    my ($suffix) = @_;
    my $name = $prefix ? "$prefix.$filename" : $filename;
    return $name.$suffix;
  };

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = $output_name->('_bismark_bt2.sam');
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = $output_name->('_bismark.txt');
  }
  else{ # SAM is the default output
    $outfile = $output_name->('_bismark.sam');
  }

  $bam = 0 unless (defined $bam);

  ### NOTE(review): the output path is interpolated unquoted into a shell command in the two piped
  ### opens below; paths containing spaces or shell metacharacters will not work - confirm whether
  ### callers guarantee shell-safe names.
  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### Only the file extension is changed. An unanchored s/sam/bam/ would instead rewrite the first
    ### 'sam' anywhere in the name (e.g. 'sample_1.fastq_bismark.sam' => 'bample_1.fastq_bismark.sam').
    $outfile =~ s/\.sam\z/.bam/;
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $output_name->($bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt');

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = $output_name->('_unmapped_reads.txt');
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = $output_name->('_ambiguous_reads.txt');
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";


  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### the SAM header is left out for the vanilla output format and when it was explicitly switched off
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
394
### Opens all output files for one pair of paired-end input files and then starts the methylation call.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - the mate 1 / mate 2 read files as provided on the command
###                                        line (may contain a path)
###   $C_to_T_infile_1, $G_to_A_infile_1 - converted temporary files made from mate 1 (may be undef)
###   $C_to_T_infile_2, $G_to_A_infile_2 - converted temporary files made from mate 2 (may be undef)
###
### Side effects: opens the global filehandles OUT and REPORT and, if requested, UNMAPPED_1/UNMAPPED_2
### and AMBIG_1/AMBIG_2 (all inside $output_dir), sets the global $bam to 0 if it was undefined, reads
### the genome into %chromosomes if that is still empty, and dies if an output file cannot be opened.
### The alignment output and the report are named after mate 1 only.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### all output names are derived from the input file names without their paths
  my $filename_1 = $sequence_file_1;
  if ($sequence_file_1 =~ /\//){
    ($filename_1) = $sequence_file_1 =~ m/.*\/(.*)$/;
  }

  my $filename_2 = $sequence_file_2;
  if ($sequence_file_2 =~ /\//){
    ($filename_2) = $sequence_file_2 =~ m/.*\/(.*)$/;
  }

  ### builds "[prefix.]basename<suffix>"; every output file of this sub is named this way
  my $output_name = sub {
    my ($basename,$suffix) = @_;
    my $name = $prefix ? "$prefix.$basename" : $basename;
    return $name.$suffix;
  };

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile = $output_name->($filename_1,'_bismark_bt2_pe.sam');
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile = $output_name->($filename_1,'_bismark_pe.txt');
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile = $output_name->($filename_1,'_bismark_pe.sam');
  }

  $bam = 0 unless (defined $bam);

  ### NOTE(review): the output path is interpolated unquoted into a shell command in the two piped
  ### opens below; paths containing spaces or shell metacharacters will not work - confirm whether
  ### callers guarantee shell-safe names.
  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### Only the file extension is changed. An unanchored s/sam/bam/ would instead rewrite the first
    ### 'sam' anywhere in the name (e.g. 'sample_1.fastq_bismark_pe.sam' => 'bample_1.fastq_bismark_pe.sam').
    $outfile =~ s/\.sam\z/.bam/;
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $output_name->($filename_1,$bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt');

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";


  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $output_name->($filename_1,'_unmapped_reads_1.txt');
    my $unmapped_2 = $output_name->($filename_2,'_unmapped_reads_2.txt');
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguously mapping read output
  if ($ambiguous){
    my $amb_1 = $output_name->($filename_1,'_ambiguous_reads_1.txt');
    my $amb_2 = $output_name->($filename_2,'_ambiguous_reads_2.txt');
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### the SAM header is left out for the vanilla output format and when it was explicitly switched off
  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
526
527 sub print_final_analysis_report_single_end{
528 my ($C_to_T_infile,$G_to_A_infile) = @_;
529 ### All sequences from the original sequence file have been analysed now
530 ### deleting temporary C->T or G->A infiles
531
532 if ($directional){
533 my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
534 if ($deletion_successful == 1){
535 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
536 }
537 else{
538 warn "Could not delete temporary file $C_to_T_infile properly $!\n";
539 }
540 }
541 elsif ($pbat){
542 my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
543 if ($deletion_successful == 1){
544 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
545 }
546 else{
547 warn "Could not delete temporary file $G_to_A_infile properly $!\n";
548 }
549 }
550 else{
551 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
552 if ($deletion_successful == 2){
553 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
554 }
555 else{
556 warn "Could not delete temporary files properly $!\n";
557 }
558 }
559
560 ### printing a final report for the alignment procedure
561 print REPORT "Final Alignment report\n",'='x22,"\n";
562 warn "Final Alignment report\n",'='x22,"\n";
563 # foreach my $index (0..$#fhs){
564 # print "$fhs[$index]->{name}\n";
565 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
566 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
567 # }
568
569 ### printing a final report for the methylation call procedure
570 warn "Sequences analysed in total:\t$counting{sequences_count}\n";
571 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
572 my $percent_alignable_sequences;
573
574 if ($counting{sequences_count} == 0){
575 $percent_alignable_sequences = 0;
576 }
577 else{
578 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
579 }
580
581 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
582 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
583
584 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
585 ### only calculating the percentage if there were any overruled alignments
586 if ($counting{low_complexity_alignments_overruled_count}){
587 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
588 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
589 }
590
591 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
592 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
593 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
594 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
595 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
596
597 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
598 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
599 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
600 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
601 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
602
603 if ($directional){
604 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
605 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
606 }
607
608 ### detailed information about Cs analysed
609 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
611 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
615 if ($bowtie2){
616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
617 }
618 warn "\n";
619
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
623 if ($bowtie2){
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
625 }
626 warn "\n";
627
628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
630
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
634 if ($bowtie2){
635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
636 }
637 print REPORT "\n";
638
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
642 if ($bowtie2){
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
644 }
645 print REPORT "\n";
646
647 my $percent_meCHG;
648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
650 }
651
652 my $percent_meCHH;
653 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
654 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
655 }
656
657 my $percent_meCpG;
658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
660 }
661
662 my $percent_meC_unknown;
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
665 }
666
667
668 ### printing methylated CpG percentage if applicable
669 if ($percent_meCpG){
670 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
672 }
673 else{
674 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
675 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
676 }
677
678 ### printing methylated C percentage (CHG context) if applicable
679 if ($percent_meCHG){
680 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
681 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
682 }
683 else{
684 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
686 }
687
688 ### printing methylated C percentage (CHH context) if applicable
689 if ($percent_meCHH){
690 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
692 }
693 else{
694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
696 }
697
698 ### printing methylated C percentage (Unknown C context) if applicable
699 if ($bowtie2){
700 if ($percent_meC_unknown){
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
703 }
704 else{
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
707 }
708 }
709 print REPORT "\n\n";
710 warn "\n\n";
711
712 if ($seqID_contains_tabs){
713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
715 }
716
717
718 ###########################################################################################################################################
719 ### create pie-chart with mapping stats
720 ###########################################################################################################################################
721
722
723 my $filename;
724 if ($pbat){
725 $filename = $G_to_A_infile;
726 }
727 else{
728 $filename = $C_to_T_infile;
729 }
730
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
732 $pie_chart =~ s/gz$//;
733 $pie_chart =~ s/_C_to_T\.fastq$//;
734 $pie_chart =~ s/_G_to_A\.fastq$//;
735
736 # if ($prefix){
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation
738 # }
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png";
740
741
742 #Check whether the module GD::Graph is installed
743 my $gd_graph_installed = 0;
744 eval{
745 require GD::Graph::pie;
746 GD::Graph::pie->import();
747 };
748
749 unless($@) {
750 $gd_graph_installed = 1;
751 }
752 else{
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
754 sleep(2);
755 }
756
757 if ($gd_graph_installed){
758 warn "Generating pie chart\n\n";
759 sleep(1);
760 my $graph = GD::Graph::pie->new(600,600);
761
762 my $percent_unaligned;
763 my $percent_multiple;
764 my $percent_unextractable;
765
766 if ($counting{sequences_count}){
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
770 }
771 else{
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
773 }
774
775 my @aln_stats = (
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
778 );
779
780 $graph->set(
781 start_angle => 180,
782 '3d' => 0,
783 label => 'Alignment stats (single-end)',
784 suppress_angle => 2, # Only label slices of sufficient size
785 transparent => 0,
786 dclrs => [ qw(red lorange dgreen cyan) ],
787 ) or die $graph->error;
788
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error;
790
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
792 binmode PIE;
793 print PIE $gd->png;
794 }
795
796 warn "====================\nBismark run complete\n====================\n\n";
797
798 }
799
800
### Final reporting step of a paired-end run.
###   1) deletes the temporary C->T / G->A converted input files from $temp_dir
###   2) prints the alignment and cytosine methylation summary (taken from the file-level %counting hash)
###      to STDERR/STDOUT and to the REPORT filehandle
###   3) if GD::Graph::pie can be loaded, writes a PNG pie chart with the mapping statistics into $output_dir
### Arguments: names of the converted temporary files for read 1 and read 2; the *_2 names may be empty, in
### which case fewer temporary files are deleted (see the comments in the deletion section).
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  if ($directional){
    if ($G_to_A_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
      if ($deletion_successful == 1){
        warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
      }
    }
  }
  else{
    if ($G_to_A_infile_2 and $C_to_T_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 4){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
  }

  ### printing a final report for the alignment procedure
  warn "Final Alignment report\n",'='x22,"\n";
  print REPORT "Final Alignment report\n",'='x22,"\n";

  ### printing a final report for the methylation call procedure
  warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";

  ### mapping efficiency; reported as 0 if no sequence pair was analysed at all (avoids a division by zero)
  my $percent_alignable_sequence_pairs;
  if ($counting{sequences_count} == 0){
    $percent_alignable_sequence_pairs = 0;
  }
  else{
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";

  print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";

  ### detailed information about Cs analysed
  if ($directional){
    print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }

  warn "Final Cytosine Methylation Report\n",'='x33,"\n";
  print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";

  ### Cs in Unknown context are deliberately not part of this total
  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
  warn "Total number of C's analysed:\t$total_number_of_C\n\n";
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
  }
  warn "\n";

  warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
  }
  warn "\n";

  print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
  print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
  }
  print REPORT "\n";

  print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
  }
  print REPORT "\n";

  ### The percentages stay undefined if no C was seen in the respective context. Note that sprintf returns a
  ### string such as '0.0', which is true in Perl, so a genuine 0% methylation level is still reported below.
  my $percent_meCHG;
  if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
    $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
  }

  my $percent_meCHH;
  if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
    $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
  }

  my $percent_meCpG;
  if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
    $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
  }

  my $percent_meC_unknown;
  if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
    $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
  }

  ### printing methylated CpG percentage if applicable
  if ($percent_meCpG){
    warn "C methylated in CpG context:\t${percent_meCpG}%\n";
    print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
  }

  ### printing methylated C percentage in CHG context if applicable
  if ($percent_meCHG){
    warn "C methylated in CHG context:\t${percent_meCHG}%\n";
    print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
  }

  ### printing methylated C percentage in CHH context if applicable
  if ($percent_meCHH){
    warn "C methylated in CHH context:\t${percent_meCHH}%\n";
    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
  }

  ### printing methylated C percentage (Unknown C context) if applicable
  if ($bowtie2){
    if ($percent_meC_unknown){
      warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
      print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
    }
    else{
      warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
      print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
    }
  }
  print REPORT "\n\n";
  warn "\n\n";


  ###########################################################################################################################################
  ### create pie-chart with mapping stats
  ###########################################################################################################################################

  ### the name of the pie chart is derived from the converted temporary file of read 1
  my $filename;
  if ($pbat){
    $filename = $G_to_A_infile_1;
  }
  else{
    $filename = $C_to_T_infile_1;
  }

  my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $pie_chart =~ s/gz$//;
  $pie_chart =~ s/_C_to_T\.fastq$//; # the dot is matched literally, as in the single-end report
  $pie_chart =~ s/_G_to_A\.fastq$//;
  ### special format for gzipped PE Bowtie1 files (the trailing dot is what is left over once 'gz' was removed above)
  $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
  $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;

  ### $prefix does not need to be handled here, it is being prepended to the temp files already
  $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";

  # Check whether the module GD::Graph is installed
  my $gd_graph_installed = 0;
  eval{
    require GD::Graph::pie;
    GD::Graph::pie->import();
  };

  unless($@) {
    $gd_graph_installed = 1;
  }
  else{
    warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
    sleep(2);
  }

  if ($gd_graph_installed){
    warn "Generating pie chart\n\n";
    sleep(1);
    my $graph = GD::Graph::pie->new(600,600);

    my $percent_unaligned;
    my $percent_multiple;
    my $percent_unextractable;

    if ($counting{sequences_count}){
      $percent_unaligned     = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
      $percent_multiple      = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
      $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
    }
    else{
      $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
    }

    ### first row: slice labels, second row: slice values
    my @aln_stats = (
                     ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
                     [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
                    );

    $graph->set(
                start_angle    => 180,
                '3d'           => 0,
                label          => 'Alignment stats (paired-end)',
                suppress_angle => 2, # Only label slices of sufficient size
                transparent    => 0,
                dclrs          => [ qw(red lorange dgreen cyan) ],
               ) or die $graph->error;

    my $gd = $graph->plot(\@aln_stats) or die $graph->error;

    open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
    binmode PIE;
    print PIE $gd->png;
    ### buffered write errors only surface when the handle is closed; the run itself has completed at this
    ### point, so a failure here is reported but not fatal
    close PIE or warn "Failed to close file for alignments pie chart $pie_chart: $!\n";
  }

  warn "====================\nBismark run complete\n====================\n\n";

}
1076
### Reads a single-end FastA file record by record (2 lines per record), looks up the alignment result of every
### read via check_bowtie_results_single_end[_bowtie2] and finally triggers the single-end analysis report.
### Arguments: the original sequence file (plain or .gz) and the names of the C->T and G->A converted temporary
### files, which are only passed on to print_final_analysis_report_single_end.
### Honours the file-level options $skip, $upto, $ambiguous and $unmapped.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile. The list form of the pipe open bypasses the shell, so file names containing
  ### spaces or shell metacharacters are passed to zcat unchanged
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    last unless ($identifier and $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers all records in the file, $counting{sequences_count} only the ones that get analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### The return value is deliberately not checked: if reading stopped early because of --upto, zcat gets
  ### terminated by the closed pipe and close() reports its non-zero exit status
  close IN;

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
1149
### Reads a single-end FastQ file record by record (4 lines per record), looks up the alignment result of every
### read via check_bowtie_results_single_end[_bowtie2] and finally triggers the single-end analysis report.
### Arguments: the original sequence file (plain or .gz) and the names of the C->T and G->A converted temporary
### files, which are only passed on to print_final_analysis_report_single_end.
### Honours the file-level options $skip, $upto, $ambiguous and $unmapped.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile. The list form of the pipe open bypasses the shell, so file names containing
  ### spaces or shell metacharacters are passed to zcat unchanged
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>; # the '+' line; only echoed to the --ambiguous/--un output files
    my $quality_value = <IN>;
    last unless ($identifier and $sequence and $identifier_2 and $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers all records in the file, $counting{sequences_count} only the ones that get analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2; # was not chomped and still carries its line ending
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### The return value is deliberately not checked: if reading stopped early because of --upto, zcat gets
  ### terminated by the closed pipe and close() reports its non-zero exit status
  close IN;

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
1229
### Reads the two FastA files of a paired-end run in lockstep (2 lines per record and file), looks up the
### alignment result of every read pair via check_bowtie_results_paired_ends[_bowtie2] and finally triggers the
### paired-end analysis report.
### Arguments: the two original sequence files (each one plain or .gz) and the names of the converted temporary
### files, which are only passed on to print_final_analysis_report_paired_ends.
### Honours the file-level options $skip, $upto, $ambiguous and $unmapped.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)

  ### Each infile is checked for gzip compression on its own, so a pair consisting of one compressed and one
  ### uncompressed file is read correctly as well. The list form of the pipe open bypasses the shell, so file
  ### names containing spaces or shell metacharacters are passed to zcat unchanged
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers all records in the files, $counting{sequences_count} only the pairs that get analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies of the IDs as returned by fix_IDs, taken before chomp and before the leading > is removed;
    ### these are what gets written to the --ambiguous/--un output files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }

    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### The return values are deliberately not checked: if reading stopped early because of --upto, zcat gets
  ### terminated by the closed pipe and close() reports its non-zero exit status
  close IN1;
  close IN2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1323
### Reads the two FastQ files of a paired-end run in lockstep (4 lines per record and file), looks up the
### alignment result of every read pair via check_bowtie_results_paired_ends[_bowtie2] and finally triggers the
### paired-end analysis report.
### Arguments: the two original sequence files (each one plain or .gz) and the names of the converted temporary
### files, which are only passed on to print_final_analysis_report_paired_ends.
### Honours the file-level options $skip, $upto, $ambiguous and $unmapped.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)

  ### Each infile is checked for gzip compression on its own, so a pair consisting of one compressed and one
  ### uncompressed file is read correctly as well. The list form of the pipe open bypasses the shell, so file
  ### names containing spaces or shell metacharacters are passed to zcat unchanged
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # the '+' line; only echoed to the --ambiguous/--un output files
    my $quality_value_1 = <IN1>; # passed on to the alignment check and echoed to the --ambiguous/--un output files
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # the '+' line; only echoed to the --ambiguous/--un output files
    my $quality_value_2 = <IN2>; # passed on to the alignment check and echoed to the --ambiguous/--un output files
    last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers all records in the files, $counting{sequences_count} only the pairs that get analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies of the IDs as returned by fix_IDs, taken before chomp and before the leading @ is removed;
    ### these are what gets written to the --ambiguous/--un output files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }

    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### The return values are deliberately not checked: if reading stopped early because of --upto, zcat gets
  ### terminated by the closed pipe and close() reports its non-zero exit status
  close IN1;
  close IN2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1435
1436 sub check_bowtie_results_single_end{
1437 my ($sequence,$identifier,$quality_value) = @_;
1438
1439 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
1440 $quality_value = 'I'x(length$sequence);
1441 }
1442
1443 my %mismatches = ();
1444 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
1445 foreach my $index (0..$#fhs){
1446
1447 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
1448 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
1449 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
1450 if ($fhs[$index]->{last_seq_id} eq $identifier) {
1451 ###############################################################
1452 ### STEP I Now processing the alignment stored in last_line ###
1453 ###############################################################
1454 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
1455 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
1456 ### we only continue to extract useful information about this alignment if 1 was returned
1457 if ($valid_alignment_found_1 == 1){
1458 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
1459 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
1460 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
1461
1462 unless($mismatch_info){
1463 $mismatch_info = '';
1464 }
1465
1466 chomp $mismatch_info;
1467 my $chromosome;
1468 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
1469 $chromosome = $mapped_chromosome;
1470 }
1471 else{
1472 die "Chromosome number extraction failed for $mapped_chromosome\n";
1473 }
1474 ### Now extracting the number of mismatches to the converted genome
1475 my $number_of_mismatches;
1476 if ($mismatch_info eq ''){
1477 $number_of_mismatches = 0;
1478 }
1479 elsif ($mismatch_info =~ /^\d/){
1480 my @mismatches = split (/,/,$mismatch_info);
1481 $number_of_mismatches = scalar @mismatches;
1482 }
1483 else{
1484 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
1485 }
1486 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1487 my $alignment_location = join (":",$chromosome,$position);
1488 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1489 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
1490 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
1491 ### number for the found alignment)
1492 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
1493 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
1494 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
1495 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
1496 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
1497 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
1498 }
1499 $number_of_mismatches = undef;
1500 ##################################################################################################################################################
1501 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or
1502 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
1503 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
1504 ##################################################################################################################################################
1505 my $newline = $fhs[$index]->{fh}-> getline();
1506 if ($newline){
1507 my ($seq_id) = split (/\t/,$newline);
1508 $fhs[$index]->{last_seq_id} = $seq_id;
1509 $fhs[$index]->{last_line} = $newline;
1510 }
1511 else {
1512 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
1513 $fhs[$index]->{last_seq_id} = undef;
1514 $fhs[$index]->{last_line} = undef;
1515 next;
1516 }
1517 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
1518 ### we only continue to extract useful information about this second alignment if 1 was returned
1519 if ($valid_alignment_found_2 == 1){
1520 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
1521 ### need to extract the chromosome name from the bowtie output by removing the trailing _CT_converted or _GA_converted suffix of the reference name
1522 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
1523 unless($mismatch_info){
1524 $mismatch_info = '';
1525 }
1526 chomp $mismatch_info;
1527
1528 my $chromosome;
1529 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
1530 $chromosome = $mapped_chromosome;
1531 }
1532 else{
1533 die "Chromosome number extraction failed for $mapped_chromosome\n";
1534 }
1535
1536 ### Now extracting the number of mismatches to the converted genome
1537 my $number_of_mismatches;
1538 if ($mismatch_info eq ''){
1539 $number_of_mismatches = 0;
1540 }
1541 elsif ($mismatch_info =~ /^\d/){
1542 my @mismatches = split (/,/,$mismatch_info);
1543 $number_of_mismatches = scalar @mismatches;
1544 }
1545 else{
1546 die "Something weird is going on with the mismatch field\n";
1547 }
1548 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1549 ### extracting the chromosome number from the bowtie output (see above)
1550 my $alignment_location = join (":",$chromosome,$position);
1551 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
1552 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
1553 ### case we are not writing the same entry out a second time.
1554 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
1555 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
1556 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
1557 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
1558 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
1559 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
1560 }
1561 ####################################################################################################################################
1562 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
1563 ####################################################################################################################################
1564 $newline = $fhs[$index]->{fh}-> getline();
1565 if ($newline){
1566 my ($seq_id) = split (/\t/,$newline);
1567 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
1568 $fhs[$index]->{last_seq_id} = $seq_id;
1569 $fhs[$index]->{last_line} = $newline;
1570 next;
1571 }
1572 else {
1573 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
1574 $fhs[$index]->{last_seq_id} = undef;
1575 $fhs[$index]->{last_line} = undef;
1576 next;
1577 }
1578 ### still within the 2nd sequence in correct orientation found
1579 }
1580 ### still within the 1st sequence in correct orientation found
1581 }
1582 ### still within the if (last_seq_id eq identifier) condition
1583 }
1584 ### still within foreach index loop
1585 }
1586 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
1587 unless(%mismatches){
1588 $counting{no_single_alignment_found}++;
1589 if ($unmapped){
1590 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
1591 }
1592 else{
1593 return;
1594 }
1595 }
1596 #######################################################################################################################################################
1597 #######################################################################################################################################################
1598 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
1599 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
1600 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
1601 #######################################################################################################################################################
1602 #######################################################################################################################################################
1603 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
1604 my $sequence_fails = 0;
1605 ### Declaring an empty hash reference which will store all information we need for the methylation call
1606 my $methylation_call_params; # hash reference!
1607 ### sorting in ascending order
1608 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
1609
1610 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
1611 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
1612 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
1613 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
1614 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
1615 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
1616 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
1617 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
1618 }
1619 }
1620 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
1621 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
1622 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
1623 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
1624 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
1625 ### reaction. E.g.
1626 ### CAGTCACGCGCGCGCG will become
1627 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
1628 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
1629 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
1630 ### G->A conversion:
1631 ### highly methylated: CAATCACACACACACA
1632 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
1633 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
1634 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
1635 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
1636 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
1637 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
1638 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
1639 ### In the above example the number of transliterations required to transform the actual sequence
1640 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
1641 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
1642 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
1643 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
1644 ### transliterations. As a threshold which does scale we will start with: the number of transliterations of the lowest best match x 2 must still be
1645 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
1646 my @three_candidate_seqs;
1647 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
1648 my $transliterations_performed;
1649 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
1650 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
1651 }
1652 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
1653 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
1654 }
1655 else{
1656 die "unexpected index number range $!\n";
1657 }
1658 push @three_candidate_seqs,{
1659 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
1660 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
1661 mismatch_number => $mismatch_number,
1662 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
1663 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
1664 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
1665 transliterations_performed => $transliterations_performed,
1666 };
1667 }
1668 ### sorting in ascending order for the lowest number of transliterations performed
1669 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
1670 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
1671 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
1672 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
1673 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
1674 if (($first_array_element*2) < $second_array_element){
1675 $counting{low_complexity_alignments_overruled_count}++;
1676 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
1677 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
1678 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
1679 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
1680 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
1681 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
1682 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
1683 }
1684 else{
1685 $sequence_fails = 1;
1686 }
1687 }
1688 else{
1689 $sequence_fails = 1;
1690 }
1691 ### after processing the alignment with the lowest number of mismatches we exit
1692 last;
1693 }
1694 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
1695 if ($sequence_fails == 1){
1696 $counting{unsuitable_sequence_count}++;
1697 if ($ambiguous){
1698 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
1699 }
1700 if ($unmapped){
1701 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
1702 }
1703 else{
1704 return 0; # => exits to next sequence (default)
1705 }
1706 }
1707
1708 ### --DIRECTIONAL
1709 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
1710 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
1711 if ($directional){
1712 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
1713 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
1714 $counting{alignments_rejected_count}++;
1715 return 0;
1716 }
1717 }
1718
1719 ### If the sequence has not been rejected so far it will have a unique best alignment
1720 $counting{unique_best_alignment_count}++;
1721 if ($pbat){
1722 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params);
1723 }
1724 else{
1725 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
1726 }
1727
1728 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
1729 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
1730 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
1731 $counting{genomic_sequence_could_not_be_extracted_count}++;
1732 return 0;
1733 }
1734
1735 ### otherwise we are set to perform the actual methylation call
1736 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
1737
1738 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
1739 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
1740 }
1741
1742 sub check_bowtie_results_single_end_bowtie2{
1743 my ($sequence,$identifier,$quality_value) = @_;
1744
1745
1746 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
1747 $quality_value = 'I'x(length$sequence);
1748 }
1749
1750 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
1751 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
1752 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
1753
1754 my $alignment_ambiguous = 0;
1755
1756 my %alignments = ();
1757
1758 ### reading from the Bowtie 2 output filehandles
1759 foreach my $index (0..$#fhs){
1760 # print "Index: $index\n";
1761 # print "$fhs[$index]->{last_line}\n";
1762 # print "$fhs[$index]->{last_seq_id}\n";
1763 # sleep (1);
1764 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
1765 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
1766
1767 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
1768 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
1769
1770 if ($fhs[$index]->{last_seq_id} eq $identifier) {
1771 # SAM format specifications for Bowtie 2
1772 # (1) Name of read that aligned
1773 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
1774 # 1 The read is one of a pair
1775 # 2 The alignment is one end of a proper paired-end alignment
1776 # 4 The read has no reported alignments
1777 # 8 The read is one of a pair and has no reported alignments
1778 # 16 The alignment is to the reverse reference strand
1779 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
1780 # 64 The read is mate 1 in a pair
1781 # 128 The read is mate 2 in a pair
1782 # 256 The read has multiple mapping states
1783 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
1784 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
1785 # (5) Mapping quality (255 means MAPQ is not available)
1786 # (6) CIGAR string representation of alignment (* if unavailable)
1787 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
1788 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
1789 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
1790 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
1791 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
1792 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
1793 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
1794 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
1795 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
1796 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
1797 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
1798 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
1799 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
1800 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
1801 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
1802 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
1803
1804 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
1805
1806 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
1807 if ($flag == 4){
1808 ## reading in the next alignment, which must be the next sequence
1809 my $newline = $fhs[$index]->{fh}-> getline();
1810 if ($newline){
1811 chomp $newline;
1812 my ($seq_id) = split (/\t/,$newline);
1813 $fhs[$index]->{last_seq_id} = $seq_id;
1814 $fhs[$index]->{last_line} = $newline;
1815 if ($seq_id eq $identifier){
1816 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
1817 }
1818 next; # next instance
1819 }
1820 else{
1821 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
1822 $fhs[$index]->{last_seq_id} = undef;
1823 $fhs[$index]->{last_line} = undef;
1824 next;
1825 }
1826 }
1827
1828 # if there are one or more proper alignments we can extract the chromosome number
1829 my $chromosome;
1830 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
1831 $chromosome = $mapped_chromosome;
1832 }
1833 else{
1834 die "Chromosome number extraction failed for $mapped_chromosome\n";
1835 }
1836
1837 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
1838 my ($alignment_score,$second_best,$MD_tag);
1839 my @fields = split (/\t/,$fhs[$index]->{last_line});
1840
1841 foreach (11..$#fields){
1842 if ($fields[$_] =~ /AS:i:(.*)/){
1843 $alignment_score = $1;
1844 }
1845 elsif ($fields[$_] =~ /XS:i:(.*)/){
1846 $second_best = $1;
1847 }
1848 elsif ($fields[$_] =~ /MD:Z:(.*)/){
1849 $MD_tag = $1;
1850 }
1851 }
1852
1853 # warn "First best alignment_score is: '$alignment_score'\n";
1854 # warn "MD tag is: '$MD_tag'\n";
1855 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
1856
1857 if (defined $second_best){
1858 # warn "second best alignment_score is: '$second_best'\n\n";
1859
1860 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
1861 if ($alignment_score == $second_best){
1862 $alignment_ambiguous = 1;
1863 ## need to read and discard all additional ambiguous reads until we reach the next sequence
1864 until ($fhs[$index]->{last_seq_id} ne $identifier){
1865 my $newline = $fhs[$index]->{fh}-> getline();
1866 if ($newline){
1867 chomp $newline;
1868 my ($seq_id) = split (/\t/,$newline);
1869 $fhs[$index]->{last_seq_id} = $seq_id;
1870 $fhs[$index]->{last_line} = $newline;
1871 }
1872 else{
1873 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
1874 $fhs[$index]->{last_seq_id} = undef;
1875 $fhs[$index]->{last_line} = undef;
1876 last; # break free in case we have reached the end of the alignment output
1877 }
1878 }
1879 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
1880 }
1881 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
1882
1883 my $alignment_location = join (":",$chromosome,$position);
1884
1885 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1886 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
1887 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
1888 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
1889
1890 unless (exists $alignments{$alignment_location}){
1891 $alignments{$alignment_location}->{seq_id} = $id;
1892 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
1893 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
1894 $alignments{$alignment_location}->{index} = $index;
1895 $alignments{$alignment_location}->{chromosome} = $chromosome;
1896 $alignments{$alignment_location}->{position} = $position;
1897 $alignments{$alignment_location}->{CIGAR} = $cigar;
1898 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
1899 }
1900
1901 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
1902 until ($fhs[$index]->{last_seq_id} ne $identifier){
1903 my $newline = $fhs[$index]->{fh}-> getline();
1904 if ($newline){
1905 chomp $newline;
1906 my ($seq_id) = split (/\t/,$newline);
1907 $fhs[$index]->{last_seq_id} = $seq_id;
1908 $fhs[$index]->{last_line} = $newline;
1909 }
1910 else{
1911 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
1912 $fhs[$index]->{last_seq_id} = undef;
1913 $fhs[$index]->{last_line} = undef;
1914 last; # break free in case we have reached the end of the alignment output
1915 }
1916 }
1917 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
1918 }
1919 }
1920 else{ # there is no second best hit, so we can just store this one and read in the next sequence
1921
1922 my $alignment_location = join (":",$chromosome,$position);
1923
1924 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1925 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
1926 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
1927 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
1928
1929 unless (exists $alignments{$alignment_location}){
1930 $alignments{$alignment_location}->{seq_id} = $id;
1931 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
1932 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
1933 $alignments{$alignment_location}->{index} = $index;
1934 $alignments{$alignment_location}->{chromosome} = $chromosome;
1935 $alignments{$alignment_location}->{position} = $position;
1936 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
1937 $alignments{$alignment_location}->{CIGAR} = $cigar;
1938 }
1939
1940 my $newline = $fhs[$index]->{fh}-> getline();
1941 if ($newline){
1942 chomp $newline;
1943 my ($seq_id) = split (/\t/,$newline);
1944 $fhs[$index]->{last_seq_id} = $seq_id;
1945 $fhs[$index]->{last_line} = $newline;
1946 if ($seq_id eq $identifier){
1947 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
1948 }
1949 }
1950 else{
1951 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
1952 $fhs[$index]->{last_seq_id} = undef;
1953 $fhs[$index]->{last_line} = undef;
1954 }
1955 }
1956 }
1957 }
1958
1959 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
1960 if ($alignment_ambiguous == 1){
1961 $counting{unsuitable_sequence_count}++;
1962 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
1963 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
1964 # print "$ambiguous_read_output\n";
1965
1966 if ($ambiguous){
1967 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
1968 }
1969 elsif ($unmapped){
1970 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
1971 }
1972 else{
1973 return 0;
1974 }
1975 }
1976
1977 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
1978 unless(%alignments){
1979 $counting{no_single_alignment_found}++;
1980 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
1981 # print "$unmapped_read_output\n";
1982 if ($unmapped){
1983 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
1984 }
1985 else{
1986 return 0; # default
1987 }
1988 }
1989
1990 #######################################################################################################################################################
1991
1992 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
1993 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
1994 ### alignment score we are discarding the sequence altogether.
1995 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
1996 ### opening (5) and extending (3 per bp) the gap.
1997
1998 #######################################################################################################################################################
1999
2000 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
2001 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2002
2003 ### print contents of %alignments for debugging
2004 # if (scalar keys %alignments > 1){
2005 # print "\n******\n";
2006 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2007 # print "Loc: $alignment_location\n";
2008 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2009 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
2010 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
2011 # print "Index $alignments{$alignment_location}->{index}\n";
2012 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2013 # print "pos: $alignments{$alignment_location}->{position}\n";
2014 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
2015 # }
2016 # print "\n******\n";
2017 # }
2018
2019 ### if there is only 1 entry in the hash with we accept it as the best alignment
2020 if (scalar keys %alignments == 1){
2021 for my $unique_best_alignment (keys %alignments){
2022 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
2023 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2024 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
2025 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2026 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
2027 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
2028 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
2029 }
2030 }
2031
2032 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2033 ### we boot the sequence altogether
2034 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2035 my $best_alignment_score;
2036 my $best_alignment_location;
2037 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
2038 # print "$alignments{$alignment_location}->{alignment_score}\n";
2039 unless (defined $best_alignment_score){
2040 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
2041 $best_alignment_location = $alignment_location;
2042 # print "setting best alignment score: $best_alignment_score\n";
2043 }
2044 else{
2045 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
2046 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
2047 # warn "Same alignment score, the sequence will get booted!\n";
2048 $sequence_fails = 1;
2049 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
2050 }
2051 ### else we are going to store the best alignment for further processing
2052 else{
2053 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
2054 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2055 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
2056 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2057 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
2058 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
2059 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
2060 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
2061 }
2062 }
2063 }
2064 }
2065 else{
2066 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
2067 }
2068
2069 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
2070 if ($sequence_fails == 1){
2071 $counting{unsuitable_sequence_count}++;
2072
2073 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2074 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
2075 # print OUT "$ambiguous_read_output\n";
2076
2077 if ($ambiguous){
2078 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
2079 }
2080 elsif ($unmapped){
2081 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
2082 }
2083 else{
2084 return 0; # => exits to next sequence (default)
2085 }
2086 }
2087
2088 ### --DIRECTIONAL
2089 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2090 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2091 if ($directional){
2092 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
2093 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2094 $counting{alignments_rejected_count}++;
2095 return 0;
2096 }
2097 }
2098
2099 ### If the sequence has not been rejected so far it has a unique best alignment
2100 $counting{unique_best_alignment_count}++;
2101
2102 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
2103 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
2104
2105 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
2106 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
2107 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
2108 $counting{genomic_sequence_could_not_be_extracted_count}++;
2109 return 0;
2110 }
2111
2112
2113 ### otherwise we are set to perform the actual methylation call
2114 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
2115 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
2116 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
2117 }
2118
2119
### Counts how many characters of a read a C->T ('CT') or G->A ('GA') transliteration would change,
### i.e. the number of Cs or the number of Gs in the sequence.
###   $sequence        - read sequence; only the local copy is inspected, the caller's string is never modified
###   $read_conversion - conversion mode of the read, needs to be either 'CT' or 'GA'
### Returns the number of affected characters. Dies if the conversion mode is missing or has any other value.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  ### a missing conversion mode is handled like an unknown one, so that we fail with the message below
  ### instead of producing an 'uninitialized value' warning first
  $read_conversion = '' unless (defined $read_conversion);

  ### tr/// with an empty replacement list only counts the matching characters, which is the very same
  ### number the real transliteration (tr/C/T/ or tr/G/A/) would report
  if ($read_conversion eq 'CT'){
    return ($sequence =~ tr/C//);
  }
  elsif ($read_conversion eq 'GA'){
    return ($sequence =~ tr/G//);
  }

  ### no system call failed at this point, so $! (errno) would only add unrelated text to the message
  die "Read conversion mode of the read was not specified (needs to be either 'CT' or 'GA')\n";
}
2134
### Decides whether the Bowtie 1 alignment that is currently buffered in $fhs[$index]->{last_line} may be used
### for the read $identifier.
###   $index      - element of @fhs (Bowtie output) that is being looked at
###   $identifier - ID of the read which is currently being processed
### Returns 1 if last_line holds an alignment of this read for which
### ensure_sensical_alignment_orientation_single_end() returned 1, and 0 otherwise.
###
### The same read may show up in at most 2 consecutive lines of the Bowtie output. Whenever the buffered alignment
### is rejected because of its orientation, further lines are read from the filehandle until either an acceptable
### alignment of this read or the first line of the next read has been stored in last_seq_id/last_line. Both are
### set to undef once the end of the Bowtie output has been reached.
### Dies if the orientation check returns anything other than 0 or 1, or if the same read ID occurs a third time.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### replaces the buffered entry of this filehandle; calling it without arguments marks the end of the Bowtie output
  my $remember = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
  };

  # extracting from Bowtie 1 format
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### 1st alignment: checking the orientation of the buffered alignment
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);   ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the alignment was in the wrong orientation, so we need to read in a new line
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $remember->();   # end of bowtie output
    return 0;
  }

  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the line we just read in already belongs to the next sequence -> store it and process it in the next round
  unless ($id eq $identifier){
    $remember->($id,$newline);
    return 0;
  }

  ### 2nd alignment of the same sequence: checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $remember->($id,$newline);
    return 1;                        ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again: whatever comes next has to be the next sequence, and gets buffered for the next round
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $remember->();   # end of bowtie output
    return 0;
  }

  my ($seq_id) = split (/\t/,$newline);
  ### this is a consistency check and not a failed system call, hence there is no $! in the message
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);
  $remember->($seq_id,$newline);
  return 0; # not processing anything this round as both alignments of this sequence were in the wrong orientation
}
2212 #########################
2213 ### BOWTIE 1 | PAIRED-END
2214 #########################
2215
### Strips the '_CT_converted' or '_GA_converted' suffix from a reference name reported by Bowtie and returns
### the remaining chromosome name. Dies if the name does not end in one of the two suffixes.
sub _remove_conversion_suffix_bowtie1{
  my ($mapped_chromosome) = @_;
  unless ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }
  return $mapped_chromosome;
}

### Returns the number of comma separated entries of an (already chomped) Bowtie 1 mismatch field; an empty field
### counts as 0 mismatches. Dies if a non-empty field does not start with a digit.
sub _count_mismatches_bowtie1{
  my ($mismatch_info) = @_;
  return 0 if ($mismatch_info eq '');
  die "Something weird is going on with the mismatch field\n" unless ($mismatch_info =~ /^\d/);
  my @mismatches = split (/,/,$mismatch_info);
  return scalar @mismatches;
}

### Parses the alignment pair buffered in last_line_1/last_line_2 of $fhs[$index] and records it in the hash
### referenced by $mismatches as $mismatches->{sum of mismatches}->{chromosome:position_1:position_2}.
### Dies if a reference name or mismatch field cannot be parsed, if read 1 starts after read 2, or if both reads
### were reported on different chromosomes.
sub _store_paired_end_alignment_bowtie1{
  my ($mismatches,$index) = @_;

  ### the limit of -1 keeps an empty mismatch field (the last column) from being dropped by split
  my ($id_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,2,3,4,7];
  my ($mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2)       = (split (/\t/,$fhs[$index]->{last_line_2},-1))[2,3,4,7];
  chomp $mismatch_info_1;
  chomp $mismatch_info_2;

  ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted)
  my $chromosome_1 = _remove_conversion_suffix_bowtie1($mapped_chromosome_1);
  my $chromosome_2 = _remove_conversion_suffix_bowtie1($mapped_chromosome_2);

  ### Now extracting the number of mismatches to the converted genome
  my $number_of_mismatches_1 = _count_mismatches_bowtie1($mismatch_info_1);
  my $number_of_mismatches_2 = _count_mismatches_bowtie1($mismatch_info_2);

  ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
  my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;

  ### creating a composite location variable from $chromosome and both positions
  die "Position 1 is higher than position 2" if ($position_1 > $position_2);
  die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
  my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);

  ### If a sequence pair aligns to exactly the same location twice we keep the entry that was stored first, so only
  ### the index of the first Bowtie output reporting this location is remembered
  return if (exists $mismatches->{$sum_of_mismatches}->{$alignment_location});

  $mismatches->{$sum_of_mismatches}->{$alignment_location} = {
    seq_id                 => $id_1,          # either is fine
    bowtie_sequence_1      => $bowtie_sequence_1,
    bowtie_sequence_2      => $bowtie_sequence_2,
    index                  => $index,
    chromosome             => $chromosome_1,  # either is fine
    start_seq_1            => $position_1,
    start_seq_2            => $position_2,
    number_of_mismatches_1 => $number_of_mismatches_1,
    number_of_mismatches_2 => $number_of_mismatches_2,
  };
  return;
}

### Reads the next 2 lines from the Bowtie output $fhs[$index]->{fh} and buffers them as last_line_1/last_line_2,
### together with the read ID (trailing '/1' removed) as last_seq_id. The lines may either be another alignment of
### the current sequence pair or already belong to the next sequence pair.
### Returns 1 if an entry was buffered. At the end of the Bowtie output all 3 values are set to undef and 0 is returned.
### Dies if neither of the 2 read IDs ends in '/1', as last_seq_id would otherwise silently keep the ID of the previous entry.
sub _buffer_next_paired_end_alignment_bowtie1{
  my ($index) = @_;

  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();

  unless ($newline_1 and $newline_2){
    # assigning undef to last_seq_id and both last_lines (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line_1} = undef;
    $fhs[$index]->{last_line_2} = undef;
    return 0;
  }

  my ($seq_id_1) = split (/\t/,$newline_1);
  my ($seq_id_2) = split (/\t/,$newline_2);

  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $fhs[$index]->{last_seq_id} = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $fhs[$index]->{last_seq_id} = $seq_id_2;
  }
  else{
    die "Either read 1 or read 2 needs to end on '/1'\n";
  }

  $fhs[$index]->{last_line_1} = $newline_1;
  $fhs[$index]->{last_line_2} = $newline_2;
  return 1;
}

### Collects the Bowtie 1 alignments of one sequence pair from the 4 Bowtie outputs in @fhs, and performs the
### methylation call and prints the mapping result if the pair has a unique best alignment.
###   $sequence_1, $sequence_2           - sequences of read 1 and read 2
###   $identifier                        - ID of the sequence pair (without the /1 or /2 tag)
###   $quality_value_1, $quality_value_2 - quality strings; if missing (FastA input) they are filled with 'I' (Phred 40)
### Return value:
###   1 - no alignment was found at all, or the best alignment was not unique and only '--unmapped' was specified
###   2 - the best alignment was not unique and '--ambiguous' was specified
###   0 - in all other cases (result printed, alignment rejected by '--directional', genomic sequence not extractable,
###       or a non-unique alignment with neither option set)
### According to the original comments the caller prints pairs to the unmapped (1) or ambiguous (2) output files.
sub check_bowtie_results_paired_ends{
  my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;

  ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
  $quality_value_1 = 'I'x(length($sequence_1)) unless ($quality_value_1);
  $quality_value_2 = 'I'x(length($sequence_2)) unless ($quality_value_2);

  ### alignments of this sequence pair: sum of mismatches -> alignment location -> alignment details
  my %mismatches = ();

  ### for paired end reads we are looking at alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
  ### alignments to the complementary strands are looked at afterwards (CTOT got index 1, and CTOB got index 2).
  ### As a location is only stored once, pairs that align to the same location in several outputs get reported for the original strands (OT and OB).
  ### This does not make any difference for the methylation calls, but it matters if alignments to the complementary strands are discarded by --directional
  foreach my $index (0,3,1,2){
    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
    ### skipping this index if the buffered entry does not belong to the sequence pair we are currently looking at
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ### STEP I   Processing the entry which is stored in last_line_1 and last_line_2.
    ### Sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments
    ### both in the wrong orientation. We only continue if 1 was returned
    next unless (decide_whether_paired_end_alignment_is_valid($index,$identifier) == 1);
    _store_paired_end_alignment_bowtie1(\%mismatches,$index);

    ### STEP II  Now reading in the next 2 lines from the bowtie filehandle, which can either be a second alignment of the same
    ### sequence pair or a new sequence pair. If it is the next sequence pair (or in the wrong orientation) the validity check
    ### does not return 1, and the entry will not be processed any further in this round
    next unless (_buffer_next_paired_end_alignment_bowtie1($index));
    next unless (decide_whether_paired_end_alignment_is_valid($index,$identifier) == 1);
    _store_paired_end_alignment_bowtie1(\%mismatches,$index);

    ### STEP III Now reading in two more lines. These have to be the next entry and are just buffered for the next round
    _buffer_next_paired_end_alignment_bowtie1($index);
  }

  ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless (%mismatches){
    $counting{no_single_alignment_found}++;
    return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
  }

  ### We are now looking if there is a unique best alignment for this sequence pair, i.e. we only look at the alignments with
  ### the lowest sum of mismatches. If there are multiple locations with the same (lowest) number we discard the pair altogether
  my ($lowest_mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my @best_locations = keys %{$mismatches{$lowest_mismatch_number}};

  unless (scalar @best_locations == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
    return 1 if ($unmapped);  # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
    return 0;                 # => exits to next sequence (default)
  }

  ### storing all information we need for the methylation call of the unique best alignment
  my $best_alignment = $mismatches{$lowest_mismatch_number}->{$best_locations[0]};
  my $methylation_call_params; # hash reference!
  $methylation_call_params->{$identifier} = {
    seq_id                 => $identifier,
    bowtie_sequence_1      => $best_alignment->{bowtie_sequence_1},
    bowtie_sequence_2      => $best_alignment->{bowtie_sequence_2},
    chromosome             => $best_alignment->{chromosome},
    start_seq_1            => $best_alignment->{start_seq_1},
    start_seq_2            => $best_alignment->{start_seq_2},
    alignment_end          => ($best_alignment->{start_seq_2}+length($best_alignment->{bowtie_sequence_2})),
    index                  => $best_alignment->{index},
    number_of_mismatches_1 => $best_alignment->{number_of_mismatches_1},
    number_of_mismatches_2 => $best_alignment->{number_of_mismatches_2},
  };

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands (index 1 and 2), as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it does have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);

  ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
  $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});

  print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  return 0; ## a pair that got this far must not be printed to unmapped_1 and _2
}
2541
2542 #########################
2543 ### BOWTIE 2 | PAIRED-END
2544 #########################
2545
### Evaluates the Bowtie 2 (SAM) output of the four alignment instances held in @fhs for one read pair
### and, if the pair has a unique best alignment, performs the methylation call and prints the result.
###
### Arguments:
###   $sequence_1, $sequence_2           - the two read sequences of the pair
###   $identifier                        - read pair ID (compared against the SAM read name with a trailing /1 removed)
###   $quality_value_1, $quality_value_2 - quality strings; optional (FastA input has none)
###
### Return value (interpreted by the caller):
###   0 - nothing further to do for this pair (reported, rejected, or discarded without output)
###   1 - the pair should be written to the unmapped read files (only returned if --unmapped was specified)
###   2 - the pair should be written to the ambiguous read files (only returned if --ambiguous was specified)
###
### Dies if a chromosome name does not end in _CT_converted/_GA_converted, if the two mates map to
### different chromosomes, if the AS:i: or MD:Z: field is missing from an aligned record, or if a pair
### without a second best alignment is followed by another record for the same read ID.
###
### Reads/advances the file-level @fhs entries and updates the file-level %counting statistics.
sub check_bowtie_results_paired_ends_bowtie2{
  my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;

  ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40.
  ### Testing for defined/length (rather than truth) so that a quality string of '0' is not mistaken for a missing one
  unless (defined $quality_value_1 and length $quality_value_1){
    $quality_value_1 = 'I' x length($sequence_1);
  }
  unless (defined $quality_value_2 and length $quality_value_2){
    $quality_value_2 = 'I' x length($sequence_2);
  }

  my %alignments;               # candidate alignments of this pair, keyed by "chromosome:position:position"
  my $alignment_ambiguous = 0;  # set to 1 as soon as one instance reports AS == XS for the pair

  ### for paired end reads we are looking at alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
  ### alignments to the complementary strands are looked at afterwards (CTOT got index 1, and CTOB got index 2).
  ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are assigned to the original
  ### strands (OT and OB) before the complementary strands. It does not make any difference for the methylation calls, but it matters if alignments
  ### to the complementary strands are not being reported when '--directional' is specified
 INSTANCE:
  foreach my $index (0,3,1,2){
    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next INSTANCE unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});

    my $instance = $fhs[$index];

    ### this instance is already positioned at a later read pair, i.e. it has nothing to say about the current one
    next INSTANCE unless ($instance->{last_seq_id} eq $identifier);

    ### SAM columns used here (0-based): 0 read name, 1 bitwise flag, 2 reference name, 3 1-based leftmost position,
    ### 5 CIGAR string, 9 read sequence; optional TAG:TYPE:VALUE fields start at column 11
    my @fields_1 = split (/\t/,$instance->{last_line_1});
    my @fields_2 = split (/\t/,$instance->{last_line_2});
    my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$cigar_1,$bowtie_sequence_1) = @fields_1[0,1,2,3,5,9];
    my ($flag_2,$mapped_chromosome_2,$position_2,$cigar_2,$bowtie_sequence_2)       = @fields_2[1,2,3,5,9];
    $id_1 =~ s/\/1$//;

    ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64),
    ### or 141 for read 2 (1+4+8+128). We can store the next alignment and move on to the next Bowtie 2 instance
    if ($flag_1 == 77 and $flag_2 == 141){
      _bt2_pe_read_next_pair($instance);
      next INSTANCE;
    }

    ### If there are one or more proper alignments we can extract the chromosome name
    my $chromosome_1 = _bt2_pe_strip_conversion_suffix($mapped_chromosome_1);
    my $chromosome_2 = _bt2_pe_strip_conversion_suffix($mapped_chromosome_2);
    die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);

    ### We will use the optional fields to determine the best alignments
    my ($alignment_score_1,$second_best_1,$MD_tag_1) = _bt2_pe_parse_optional_tags(\@fields_1);
    my ($alignment_score_2,$second_best_2,$MD_tag_2) = _bt2_pe_parse_optional_tags(\@fields_2);
    _bt2_pe_require_tags(1,$alignment_score_1,$MD_tag_1,$instance);
    _bt2_pe_require_tags(2,$alignment_score_2,$MD_tag_2,$instance);

    ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
    my $sum_of_alignment_scores = $alignment_score_1 + $alignment_score_2;
    my $has_second_best         = (defined $second_best_1 && defined $second_best_2) ? 1 : 0;

    ### If the sum of alignment scores of the pair is the same as the sum of the second best scores we are going to boot this sequence pair
    ### altogether. All further records of this pair are read and discarded so that the instance is positioned at the next read pair
    if ($has_second_best and $sum_of_alignment_scores == $second_best_1 + $second_best_2){
      $alignment_ambiguous = 1;
      _bt2_pe_skip_remaining_records($instance,$identifier);
      next INSTANCE;
    }

    my $alignment_location;
    if ($has_second_best){
      ### the lower position always goes first in the key
      my ($lower,$upper) = ($position_1 <= $position_2) ? ($position_1,$position_2) : ($position_2,$position_1);
      $alignment_location = join(":",$chromosome_1,$lower,$upper);
    }
    else{
      ### NOTE(review): unlike the branch above, the positions are NOT ordered here. This is kept exactly as in the original code;
      ### if position_2 < position_1 the same location can end up under two different keys - confirm whether that is intended
      $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
    }

    ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
    ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not
    ### needed to overwrite the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call
    ### would not be affected by this. The only thing which would change is the index number for the found alignment. We will continue to assign these
    ### alignments to the first indexes 0 and 3, i.e. OT and OB
    unless (exists $alignments{$alignment_location}){
      $alignments{$alignment_location} = {
                                          seq_id                  => $id_1,
                                          alignment_score_1       => $alignment_score_1,
                                          alignment_score_2       => $alignment_score_2,
                                          sum_of_alignment_scores => $sum_of_alignment_scores,
                                          bowtie_sequence_1       => $bowtie_sequence_1,
                                          bowtie_sequence_2       => $bowtie_sequence_2,
                                          index                   => $index,
                                          chromosome              => $chromosome_1, # either is fine
                                          position_1              => $position_1,
                                          position_2              => $position_2,
                                          mismatch_info_1         => $MD_tag_1,
                                          mismatch_info_2         => $MD_tag_2,
                                          CIGAR_1                 => $cigar_1,
                                          CIGAR_2                 => $cigar_2,
                                          flag_1                  => $flag_1,
                                          flag_2                  => $flag_2,
                                         };
    }

    if ($has_second_best){
      ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
      _bt2_pe_skip_remaining_records($instance,$identifier);
    }
    else{
      ### there was no second best hit, so the very next record pair has to belong to a different read pair
      if (_bt2_pe_read_next_pair($instance) and $instance->{last_seq_id} eq $identifier){
        die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $instance->{last_seq_id}!\n";
      }
    }
  }

  ### if the read pair produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now
  if ($alignment_ambiguous == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => printed to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
    return 1 if ($unmapped);  # => printed to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
    return 0;
  }

  ### if no alignment was found for a certain sequence pair at all we continue with the next pair in the sequence files
  unless (%alignments){
    $counting{no_single_alignment_found}++;
    return 1 if ($unmapped);  # => printed to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' was specified
    return 0;
  }

  #######################################################################################################################################################

  ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only
  ### one single best position we are going to store the alignment information in $methylation_call_params. If there are multiple hits with the same
  ### (highest) sum of alignment scores we are discarding the sequence pair altogether.
  ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
  ### and extending (3 per bp) the gap.

  #######################################################################################################################################################

  my $methylation_call_params;  # hash reference, will store all information we need for the methylation call
  my $sequence_pair_fails = 0;  # 'memory' if a sequence pair could not be aligned uniquely (set to 1 then)
  my $best_alignment_location;  # key into %alignments of the unique best alignment (if there is one)

  my $number_of_alignments = scalar keys %alignments;

  ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
  if ($number_of_alignments == 1){
    ($best_alignment_location) = keys %alignments;
  }
  ### otherwise we find out whether the two highest-scoring candidates differ (unique best alignment) or tie (the pair gets booted)
  elsif ($number_of_alignments >= 2 and $number_of_alignments <= 4){
    my @ranked_locations = sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments;
    my ($best,$runner_up) = @ranked_locations[0,1];

    if ($alignments{$runner_up}->{sum_of_alignment_scores} == $alignments{$best}->{sum_of_alignment_scores}){
      $sequence_pair_fails = 1;
    }
    else{
      $best_alignment_location = $best;
    }
  }
  else{
    die "There are too many potential hits for this sequence pair (1-4 expected, but found: '$number_of_alignments')\n";
  }

  if (defined $best_alignment_location){
    $methylation_call_params->{$identifier} = _bt2_pe_alignment_to_call_params($alignments{$best_alignment_location});
  }

  ### skipping the sequence pair completely if there were multiple alignments with the same best sum of alignment scores at different positions
  if ($sequence_pair_fails == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => printed (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
    return 1 if ($unmapped);  # => printed (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
    return 0;                 # => exits to next sequence pair (default)
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand.
  ### We will therefore discard all alignments to strands complementary to the original strands (indexes 1 and 2), as they should not exist in reality
  ### due to the library preparation protocol
  if ($directional){
    my $best_index = $methylation_call_params->{$identifier}->{index};
    if ($best_index == 1 or $best_index == 2){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence pair has not been rejected so far it does have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);

  ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### now we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
  $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});

  print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
}

### Private helper: reads the next two lines (one record per mate) from $instance->{fh} and stores them in
### $instance->{last_line_1}/{last_line_2}; {last_seq_id} becomes the first column of the first line without a trailing /1.
### If either line could not be read, all three entries are set to undef (end of the Bowtie 2 output).
### Returns 1 if a new record pair was stored, 0 at the end of the output.
sub _bt2_pe_read_next_pair{
  my ($instance) = @_;

  ### both lines are always requested, even if the first one is already missing
  my $newline_1 = $instance->{fh}->getline();
  my $newline_2 = $instance->{fh}->getline();

  unless ($newline_1 and $newline_2){
    $instance->{last_seq_id} = undef;
    $instance->{last_line_1} = undef;
    $instance->{last_line_2} = undef;
    return 0;
  }

  chomp $newline_1;
  chomp $newline_2;
  my ($seq_id) = split (/\t/,$newline_1);
  $seq_id =~ s/\/1$//;

  $instance->{last_seq_id} = $seq_id;
  $instance->{last_line_1} = $newline_1;
  $instance->{last_line_2} = $newline_2;
  return 1;
}

### Private helper: reads and discards record pairs from $instance until its {last_seq_id} differs from $identifier
### or the end of the Bowtie 2 output was reached.
sub _bt2_pe_skip_remaining_records{
  my ($instance,$identifier) = @_;
  until ($instance->{last_seq_id} ne $identifier){
    last unless (_bt2_pe_read_next_pair($instance)); # break free if the end of the alignment output was reached
  }
  return;
}

### Private helper: returns the reference name without its trailing _CT_converted or _GA_converted; dies if there is no such suffix.
sub _bt2_pe_strip_conversion_suffix{
  my ($mapped_chromosome) = @_;
  unless ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }
  return $mapped_chromosome;
}

### Private helper: scans the optional fields (column 12 onwards) of an already split SAM record.
### Returns the list (AS:i value, XS:i value, MD:Z value); an entry is undef if the field is absent.
### The tags are anchored to the start of the field so that they cannot match inside another field's value.
sub _bt2_pe_parse_optional_tags{
  my ($fields) = @_;
  my ($alignment_score,$second_best,$MD_tag);

  foreach my $field (@{$fields}[11..$#{$fields}]){
    if ($field =~ /^AS:i:(.*)/){
      $alignment_score = $1;
    }
    elsif ($field =~ /^XS:i:(.*)/){
      $second_best = $1;
    }
    elsif ($field =~ /^MD:Z:(.*)/){
      $MD_tag = $1;
    }
  }
  return ($alignment_score,$second_best,$MD_tag);
}

### Private helper: dies with a diagnostic message unless both the alignment score and the MD tag of mate $mate_number are defined.
### Missing values are shown as 'undef' so that building the message does not itself trigger 'uninitialized value' warnings.
sub _bt2_pe_require_tags{
  my ($mate_number,$alignment_score,$MD_tag,$instance) = @_;
  return if (defined $alignment_score and defined $MD_tag);

  my $shown_score = defined $alignment_score ? $alignment_score : 'undef';
  my $shown_tag   = defined $MD_tag          ? $MD_tag          : 'undef';
  die "Failed to extract alignment score $mate_number ($shown_score) and MD tag ($shown_tag)!\nlast alignment 1: $instance->{last_line_1}\nlast alignment 2: $instance->{last_line_2}\n";
}

### Private helper: returns a new hash reference holding a copy of every entry of an %alignments record except seq_id.
### A copy is returned so that entries added to it later on do not show up in the %alignments record.
sub _bt2_pe_alignment_to_call_params{
  my ($alignment) = @_;
  my %call_params = %{$alignment};
  delete $call_params{seq_id};
  return \%call_params;
}
3080
3081 ###
3082
### Decides whether the paired-end Bowtie alignment currently stored for filehandle $index (in $fhs[$index]->{last_line_1} and
### ->{last_line_2}) belongs to the sequence pair $identifier and was reported in a sensible orientation.
###
### Returns 1 if a usable alignment for $identifier is (now) stored in $fhs[$index], and 0 otherwise. While looking for a usable
### alignment up to two further line pairs are read from $fhs[$index]->{fh}; whatever was read last is written back to
### last_seq_id/last_line_1/last_line_2 (all three are set to undef once the Bowtie output is exhausted).
### Dies if neither read of a freshly read pair ends on /1, if the orientation check returns something other than 0 or 1, or if
### the same sequence ID turns up in three consecutive line pairs.
sub decide_whether_paired_end_alignment_is_valid{
    my ($index,$identifier) = @_;

    my @columns_1 = split (/\t/,$fhs[$index]->{last_line_1},-1);
    my @columns_2 = split (/\t/,$fhs[$index]->{last_line_2},-1);
    my ($id_1,$strand_1,$mismatch_info_1) = @columns_1[0,1,7];
    my ($id_2,$strand_2,$mismatch_info_2) = @columns_2[0,1,7];
    ### the mismatch column is chomped as before, it is not looked at any further in this subroutine
    chomp $mismatch_info_1;
    chomp $mismatch_info_2;

    ### either of the two lines may hold read 1, so the /1 tag is removed from both IDs
    (my $seq_id_1 = $id_1) =~ s/\/1$//;
    (my $seq_id_2 = $id_2) =~ s/\/1$//;

    ### remembers a sequence ID and its two Bowtie output lines as the pending entry of this filehandle
    my $remember = sub {
        my ($seq_id,$line_1,$line_2) = @_;
        $fhs[$index]->{last_seq_id} = $seq_id;
        $fhs[$index]->{last_line_1} = $line_1;
        $fhs[$index]->{last_line_2} = $line_2;
    };

    ### strips the /1 tag off whichever of $seq_id_1/$seq_id_2 carries it (trying $seq_id_1 first) and returns the stripped ID
    my $first_read_id = sub {
        return $seq_id_1 if ($seq_id_1 =~ s/\/1$//);
        return $seq_id_2 if ($seq_id_2 =~ s/\/1$//);
        die "One of the two reads needs to end on /1!!";
    };

    ### the pair stored in @fhs is already the next sequence pair to be analysed -> analyse it next round
    return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

    ### 1st attempt: the alignment that is already stored
    my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
    return 1 if ($orientation == 1); ### 1st possibility for A SEQUENCE-PAIR TO PASS
    die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

    ### 2nd attempt: the stored alignment had the wrong orientation, so we need to read in two new lines
    my $newline_1 = $fhs[$index]->{fh}->getline();
    my $newline_2 = $fhs[$index]->{fh}->getline();
    unless ($newline_1 and $newline_2){
        ### end of the bowtie output; nothing is processed as the stored alignment was in the wrong orientation
        $remember->(undef,undef,undef);
        return 0;
    }

    ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
    ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
    $seq_id_1 = $id_1;
    $seq_id_2 = $id_2;
    my $seqid = $first_read_id->();

    ### the pair we just read in is already the next sequence pair -> store it and process it in the next round
    unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
        $remember->($seqid,$newline_1,$newline_2);
        return 0;
    }

    $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
    if ($orientation == 1){
        $remember->($seqid,$newline_1,$newline_2);
        return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
    }
    die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

    ### wrong orientation for the second time: the next two lines must belong to the next entry, they only get stored in @fhs
    $newline_1 = $fhs[$index]->{fh}->getline();
    $newline_2 = $fhs[$index]->{fh}->getline();
    unless ($newline_1 and $newline_2){
        ### end of the bowtie output
        $remember->(undef,undef,undef);
        return 0;
    }

    ($seq_id_1) = split (/\t/,$newline_1);
    ($seq_id_2) = split (/\t/,$newline_2);
    $seqid = $first_read_id->();

    ### the same seq ID must not occur a third time in a row
    die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
    $remember->($seqid,$newline_1,$newline_2);
    return 0; # not processing anything this round as both alignments seen for $identifier were in the wrong orientation
}
3203
3204 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
3205
### Looks up the genomic sequence underneath both reads of a Bowtie 1 paired-end alignment, each with 2 extra bases at one of
### its ends so that a CpG, CHG or CHH call can also be made for a C at the first or last position of the read.
###
### Reads index, chromosome, start_seq_1/2 and bowtie_sequence_1/2 from $methylation_call_params->{$sequence_identifier} and
### writes alignment_read_1/2, genome_conversion, read_conversion_1/2 and unmodified_genomic_sequence_1/2 back to it. A genomic
### sequence is stored as '' if a boundary check finds that the extra bases are not available. Also increments the strand
### counter in %counting that belongs to the index. Dies if the index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_paired_ends {
    my ($sequence_identifier,$methylation_call_params) = @_;

    my $index  = $methylation_call_params->{$sequence_identifier}->{index};
    my $params = $methylation_call_params->{$sequence_identifier};

    ### where the 2 extra bases are taken relative to the bowtie alignment
    my ($DOWNSTREAM,$UPSTREAM) = (0,1);

    ### genomic sequence covered by bowtie alignment 1 or 2 plus 2 extra bases. The window starts 2 bp earlier if the extra bases
    ### are wanted upstream, otherwise it simply extends 2 bp further
    my $genomic_window = sub {
        my ($alignment,$side) = @_;
        my $start = $params->{"start_seq_$alignment"};
        $start -= 2 if ($side);
        return substr ($chromosomes{$params->{chromosome}},$start,length($params->{"bowtie_sequence_$alignment"})+2);
    };

    ### true if the chromosome is long enough to hold the 2 extra bases downstream of bowtie alignment 1 or 2
    my $downstream_available = sub {
        my ($alignment) = @_;
        return (length($chromosomes{$params->{chromosome}}) > $params->{"start_seq_$alignment"}+length($params->{"bowtie_sequence_$alignment"})+1);
    };

    ### true if bowtie alignment 1 or 2 starts far enough into the chromosome to take 2 extra bases upstream
    my $upstream_available = sub {
        my ($alignment) = @_;
        return ($params->{"start_seq_$alignment"}-1 > 0);
    };

    my ($alignment_read_1,$alignment_read_2);
    my ($read_conversion_info_1,$read_conversion_info_2);
    my $genome_conversion;
    my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

    ### Bowtie reports the + alignment first and the - alignment second, irrespective of which of the two was read 1. The genomic
    ### sequences however are stored in read order, so for indexes 2 and 3 the second alignment ends up in sequence 1 and vice versa

    if ($index == 0){
        ### [Index 0] CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation)
        $counting{CT_GA_CT_count}++;
        ($alignment_read_1,$alignment_read_2)             = ('+','-');
        ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
        $genome_conversion = 'CT';

        ### forward hit: 2 extra bases at the 3' end
        $non_bisulfite_sequence_1 = $genomic_window->(1,$DOWNSTREAM);
        ### reverse hit: 2 extra bases downstream which end up 5' after reverse complementation
        $non_bisulfite_sequence_2 = $downstream_available->(2) ? reverse_complement($genomic_window->(2,$DOWNSTREAM)) : '';
    }
    elsif ($index == 1){
        ### [Index 1] GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation)
        $counting{GA_CT_GA_count}++;
        ($alignment_read_1,$alignment_read_2)             = ('+','-');
        ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
        $genome_conversion = 'GA';

        ### forward hit: 2 extra bases at the 5' end
        $non_bisulfite_sequence_1 = $upstream_available->(1) ? $genomic_window->(1,$UPSTREAM) : '';
        ### reverse hit: 2 extra bases upstream which end up 3' after reverse complementation
        $non_bisulfite_sequence_2 = reverse_complement($genomic_window->(2,$UPSTREAM));
    }
    elsif ($index == 2){
        ### [Index 2] GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation)
        $counting{GA_CT_CT_count}++;
        ($alignment_read_1,$alignment_read_2)             = ('-','+');
        ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
        $genome_conversion = 'CT';

        ### sequence 1 comes from the second alignment: 2 extra bases downstream, then reverse complemented
        $non_bisulfite_sequence_1 = reverse_complement($genomic_window->(2,$DOWNSTREAM));
        ### sequence 2 comes from the first alignment: 2 extra bases at the 3' end
        $non_bisulfite_sequence_2 = $downstream_available->(1) ? $genomic_window->(1,$DOWNSTREAM) : '';
    }
    elsif ($index == 3){
        ### [Index 3] CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation)
        $counting{CT_GA_GA_count}++;
        ($alignment_read_1,$alignment_read_2)             = ('-','+');
        ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
        $genome_conversion = 'GA';

        ### sequence 1 comes from the second alignment: 2 extra bases upstream, then reverse complemented
        $non_bisulfite_sequence_1 = $upstream_available->(2) ? reverse_complement($genomic_window->(2,$UPSTREAM)) : '';
        ### sequence 2 comes from the first alignment: 2 extra bases at the 5' end
        $non_bisulfite_sequence_2 = $genomic_window->(1,$UPSTREAM);
    }
    else{
        die "Too many bowtie result filehandles\n";
    }

    ### the alignment strand tells us which strand of the genomic sequence the read is compared against, the read conversion
    ### whether we are looking for C->T or G->A substitutions
    $params->{alignment_read_1}  = $alignment_read_1;
    $params->{alignment_read_2}  = $alignment_read_2;
    $params->{genome_conversion} = $genome_conversion;
    $params->{read_conversion_1} = $read_conversion_info_1;
    $params->{read_conversion_2} = $read_conversion_info_2;
    $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
3346
3347 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
3348
### Helper for extract_corresponding_genomic_sequence_paired_ends_bowtie2: follows the CIGAR operations of one read along a
### chromosome and collects the genomic sequence underneath the read.
###   $chromosome            - key into %chromosomes
###   $pos                   - 0-based genomic position of the first aligned base
###   $cigar                 - the complete CIGAR string (used to classify unexpected operations and in error messages)
###   $read_number           - 1 or 2, only used in error messages
###   $lengths, $operations  - array references with the length and the operation of every CIGAR element
### Returns the list (genomic sequence, position following the last operation, number of inserted plus deleted bases).
### Dies if an operation other than 'M', 'I' or 'D' is encountered.
sub _genomic_sequence_from_cigar_paired_ends_bowtie2{
    my ($chromosome,$pos,$cigar,$read_number,$lengths,$operations) = @_;

    my $genomic_sequence = '';
    my $indels = 0; # inserted plus deleted bases, needed for the edit distance (NM field) of the SAM output

    foreach my $element (0..$#{$lengths}){
        my $length    = $lengths->[$element];
        my $operation = $operations->[$element];

        if ($operation eq 'M'){
            # extracting genomic sequence and adjusting the position
            $genomic_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
            $pos += $length;
        }
        elsif ($operation eq 'I'){ # insertion in the read sequence
            # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls.
            # The position doesn't need adjusting
            $genomic_sequence .= 'N' x $length;
            $indels += $length;
        }
        elsif ($operation eq 'D'){ # deletion in the read sequence
            # we do not add any genomic sequence but only adjust the position
            $pos += $length;
            $indels += $length;
        }
        elsif ($cigar =~ tr/NSHPX=//){ # if these (for standard mapping) illegal characters exist we die
            die "The CIGAR $read_number string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else{
            die "The CIGAR $read_number string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
    }
    return ($genomic_sequence,$pos,$indels);
}

### Looks up the genomic sequence underneath both reads of a Bowtie 2 paired-end alignment, following the CIGAR string of each
### read and adding 2 extra bases at one end (5' for indexes 1 and 3, 3' for indexes 0 and 2) so that CpG, CHG and CHH calls
### can also be made for a C at the first or last position of a read.
###
### Reads index, chromosome, position_1/2 and CIGAR_1/2 from $methylation_call_params->{$sequence_identifier} and writes
### alignment_read_1/2, genome_conversion, read_conversion_1/2, unmodified_genomic_sequence_1/2, end_position_1/2 and indels_1/2
### back to it. Also increments the strand counter in %counting that belongs to the index.
###
### If the 2 extra bases would lie outside of the chromosome the subroutine returns early. In that case BOTH
### unmodified_genomic_sequence entries are stored with whatever has been collected so far (at least one of them is then too
### short), so that a length comparison against the reads can reject the pair without running into undefined values.
###
### Dies if a CIGAR string cannot be split into matching lengths and operations, contains operations other than 'M', 'I' and
### 'D', or if the index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
    my ($sequence_identifier,$methylation_call_params) = @_;

    my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
    my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
    my $params  = $methylation_call_params->{$sequence_identifier};

    my $index      = $params->{index};
    my $chromosome = $params->{chromosome};

    ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
    my $pos_1 = $params->{position_1}-1;
    my $pos_2 = $params->{position_2}-1;

    # parsing CIGAR 1 string
    my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
    my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
    shift @ops_1; # remove the empty first element
    die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
    # parsing CIGAR 2 string
    my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
    my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
    shift @ops_2; # remove the empty first element
    die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);

    ### the 2 extra bases are taken at the 5' end for indexes 1 and 3, and at the 3' end for indexes 0 and 2
    my $extra_bases_upstream   = ($index == 1 or $index == 3);
    my $extra_bases_downstream = ($index == 0 or $index == 2);

    my $non_bisulfite_sequence_1 = '';
    my $non_bisulfite_sequence_2 = '';

    ### used whenever we have to give up at the edge of a chromosome: always stores both sequences so that neither entry is
    ### left undefined
    my $store_sequences_so_far = sub {
        $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
        $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
    };

    ### Extracting read 1 genomic sequence ###

    # extracting 2 additional bp at the 5' end (read 1)
    if ($extra_bases_upstream){
        # $pos_1 is 0 based, so the 2 extra bases are available as long as $pos_1-2 is not negative (same test as for read 2)
        unless ( ($pos_1-2) >= 0){
            $store_sequences_so_far->();
            return;
        }
        $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1-2,2);
    }

    my ($aligned_sequence_1,$end_1,$indels_1) = _genomic_sequence_from_cigar_paired_ends_bowtie2($chromosome,$pos_1,$cigar_1,1,\@len_1,\@ops_1);
    $non_bisulfite_sequence_1 .= $aligned_sequence_1;
    $pos_1 = $end_1;

    ### 3' end of read 1
    if ($extra_bases_downstream){
        # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
        unless (length($chromosomes{$chromosome}) >= $pos_1+2){
            $store_sequences_so_far->();
            return;
        }
        $non_bisulfite_sequence_1 .= substr ($chromosomes{$chromosome},$pos_1,2);
    }

    ### Extracting read 2 genomic sequence ###

    ### 5' end of read 2
    if ($extra_bases_upstream){
        unless ( ($pos_2-2) >= 0){
            $store_sequences_so_far->();
            return;
        }
        $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2-2,2);
    }

    my ($aligned_sequence_2,$end_2,$indels_2) = _genomic_sequence_from_cigar_paired_ends_bowtie2($chromosome,$pos_2,$cigar_2,2,\@len_2,\@ops_2);
    $non_bisulfite_sequence_2 .= $aligned_sequence_2;
    $pos_2 = $end_2;

    ### 3' end of read 2
    if ($extra_bases_downstream){
        unless (length($chromosomes{$chromosome}) >= $pos_2+2){
            $store_sequences_so_far->();
            return;
        }
        $non_bisulfite_sequence_2 .= substr ($chromosomes{$chromosome},$pos_2,2);
    }

    ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read
    ### against, the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
    my $alignment_read_1;
    my $alignment_read_2;
    my $read_conversion_info_1;
    my $read_conversion_info_2;
    my $genome_conversion;

    ### the genomic sequences are kept in read order (sequence 1 belongs to read 1), only the sequence of the read that is
    ### on the reverse strand needs to be reverse complemented

    ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
    if ($index == 0){
        ### [Index 0, sequence originated from (converted) forward strand]
        $counting{CT_GA_CT_count}++;
        $alignment_read_1 = '+';
        $alignment_read_2 = '-';
        $read_conversion_info_1 = 'CT';
        $read_conversion_info_2 = 'GA';
        $genome_conversion = 'CT';
        ### Read 2 is on the reverse strand
        $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }

    ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
    elsif ($index == 1){
        ### [Index 1, sequence originated from complementary to (converted) bottom strand]
        $counting{GA_CT_GA_count}++;
        $alignment_read_1 = '+';
        $alignment_read_2 = '-';
        $read_conversion_info_1 = 'GA';
        $read_conversion_info_2 = 'CT';
        $genome_conversion = 'GA';
        ### Read 2 is on the reverse strand
        $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }

    ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
    elsif ($index == 2){
        ### [Index 2, sequence originated from the complementary to (converted) top strand]
        $counting{GA_CT_CT_count}++;
        $alignment_read_1 = '-';
        $alignment_read_2 = '+';
        $read_conversion_info_1 = 'GA';
        $read_conversion_info_2 = 'CT';
        $genome_conversion = 'CT';
        ### Read 1 is on the reverse strand
        $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
    }

    ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
    elsif ($index == 3){
        ### [Index 3, sequence originated from the (converted) reverse strand]
        $counting{CT_GA_GA_count}++;
        $alignment_read_1 = '-';
        $alignment_read_2 = '+';
        $read_conversion_info_1 = 'CT';
        $read_conversion_info_2 = 'GA';
        $genome_conversion = 'GA';
        ### Read 1 is on the reverse strand
        $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
    }
    else{
        die "Too many bowtie result filehandles\n";
    }

    $params->{alignment_read_1}  = $alignment_read_1;
    $params->{alignment_read_2}  = $alignment_read_2;
    $params->{genome_conversion} = $genome_conversion;
    $params->{read_conversion_1} = $read_conversion_info_1;
    $params->{read_conversion_2} = $read_conversion_info_2;
    $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
    ## after walking the CIGAR string $pos holds the end position of a read
    $params->{end_position_1} = $pos_1;
    $params->{end_position_2} = $pos_2;
    $params->{indels_1} = $indels_1;
    $params->{indels_2} = $indels_2;
}
3578
3579 ##########################################
3580 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
3581 ##########################################
3582
### Writes one uniquely mapped single-end read (Bowtie 1) together with its methylation call to the output.
###   $identifier              - read ID, also the key into $methylation_call_params
###   $sequence                - the read sequence
###   $methylation_call_params - hash reference holding the alignment details of the read
###   $quality_value           - FastQ quality string of the read
### Side effect: the stored start position of the read is incremented by 1.
### With $vanilla set a tab-separated line is printed to the OUT filehandle, otherwise the read is handed to
### single_end_SAM_output.
sub print_bisulfite_mapping_result_single_end{
    my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

    ### the FastQ quality is written in Sanger encoding (Phred 33 scale), so Phred64 or Solexa qualities get converted first
    $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                   : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                   :            $quality_value;

    ### Bowtie 1 reports the index and not the bp position, so the start position of single-end reads is shifted by +1 bp
    $methylation_call_params->{$identifier}->{position} += 1;

    if ($vanilla){
        ### one tab-separated line per read: ID, alignment details, read sequence, genomic sequence, methylation call,
        ### conversions and quality string
        my $alignment = $methylation_call_params->{$identifier};
        my @columns = (
            $identifier,
            @{$alignment}{qw(alignment_strand chromosome position end_position)},
            $sequence,
            @{$alignment}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
            $quality_value,
        );
        print OUT join("\t",@columns)."\n";
    }
    else{ # SAM output, default since Bismark v1.0.0
        single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
    }
}
3606
3607 ##########################################
3608 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
3609 ##########################################
3610
### Writes one mapped single-end read (Bowtie 2 mode) and its methylation call to the SAM output.
### Takes the read ID, the read sequence, the hash ref with the alignment details and the quality string.
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

  ### qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### unmapped and ambiguous reads were already printed, so only mapped reads arrive here;
  ### the actual writing is done by single_end_SAM_output (at the end of the script)
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
3625
3626 ##########################################
3627 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
3628 ##########################################
3629
### Writes one aligned read pair (Bowtie 1 mode) together with the methylation calls of both reads.
###   $identifier                        - read pair ID, also used as the key into $methylation_call_params
###   $sequence_1, $sequence_2           - the sequences of read 1 and read 2
###   $methylation_call_params           - hash ref holding the alignment details of the pair
###   $quality_value_1, $quality_value_2 - quality strings of read 1 and read 2
### Side effect: the stored start position of read 1 (start_seq_1) is incremented by 1.
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

  ### qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    ### the loop variable aliases the two quality strings, so both get converted in place (read 1 first)
    foreach my $quality ($quality_value_1,$quality_value_2){
      $quality = $to_phred33->($quality);
    }
  }

  ### Bowtie 1 reports the index and not the bp position, hence the start position is shifted by +1
  ### (the end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;
  my $pair = $methylation_call_params->{$identifier};

  ### SAM output is handled by paired_end_SAM_output (at the end of the script)
  unless ($vanilla){
    return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  }

  ### vanilla output: a single tab-delimited line per read pair
  my @columns = (
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  print OUT join("\t",@columns)."\n";
}
3656
3657 ##########################################
3658 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
3659 ##########################################
3660
### Writes one aligned read pair (Bowtie 2 mode) and the methylation calls of both reads to the SAM output.
### Takes the pair ID, both read sequences, the hash ref with the alignment details and both quality strings.
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

  ### qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    ### the loop variable aliases the two quality strings, so both get converted in place (read 1 first)
    foreach my $quality ($quality_value_1,$quality_value_2){
      $quality = $to_phred33->($quality);
    }
  }

  ### unmapped and ambiguous reads were already printed, so only aligned pairs arrive here;
  ### the actual writing is done by paired_end_SAM_output (at the end of the script)
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
}
3678
3679
### Re-encodes a whole quality string from the Phred 64 scale to the Phred 33 (Sanger) scale.
### Every character is first turned into its Phred score and then into the Phred 33 character for that score.
### Returns the converted quality string (same number of characters as the input).
sub convert_phred64_quals_to_phred33{
  my ($qual) = @_;

  my @phred33_chars = map {
    my $phred_score = convert_phred64_quality_string_into_phred_score($_);
    convert_phred_score_into_phred33_quality_string($phred_score);
  } split (//,$qual);

  return join ("",@phred33_chars);
}
3695
### Re-encodes a whole quality string from the Solexa (pre 1.3) encoding to the Phred 33 (Sanger) scale.
### Every character is first turned into a Phred score and then into the Phred 33 character for that score.
### Returns the converted quality string (same number of characters as the input).
sub convert_solexa_quals_to_phred33{
  my ($qual) = @_;

  my @phred33_chars = map {
    my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score($_);
    convert_phred_score_into_phred33_quality_string($phred_score);
  } split (//,$qual);

  return join ("",@phred33_chars);
}
3711
### Returns the Phred 33 quality character for a numeric Phred score, i.e. the character with code (score + 33).
sub convert_phred_score_into_phred33_quality_string{
  my ($phred_score) = @_;
  return chr($phred_score + 33);
}
3717
### Returns the numeric Phred score of a single Phred 64 encoded quality character, i.e. its character code minus 64.
sub convert_phred64_quality_string_into_phred_score{
  my ($quality_char) = @_;
  return ord($quality_char) - 64;
}
3723
### Returns a score for a single Solexa (pre 1.3) encoded quality character, i.e. its character code minus 59.
### Rationale for the fixed offset of 59 as given by the original author: all Phred scores between 10 and 40
### look exactly the same, there is only a minute difference for values between 0 and 10.
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  my ($quality_char) = @_;
  return ord($quality_char) - 59;
}
3730
3731
### Extracts the genomic sequence a single-end read (Bowtie 1 mode) aligned to and records which of the 4 possible
### bisulfite strands the alignment came from.
###   $sequence_identifier     - read ID, used as the key into $methylation_call_params
###   $methylation_call_params - hash ref; the entry of the read must provide index, chromosome, position and bowtie_sequence
### The entry of the read receives alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence
### and end_position. The extracted sequence is 2 bases longer than the read so that CpG, CHG and CHH context can still
### be determined for a C at the last position of the read; it is left empty if the read sits too close to the edge
### of the chromosome. One of the strand counters in %counting is incremented. Dies if index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $index = $methylation_call_params->{$sequence_identifier}->{index};
  my $read  = $methylation_call_params->{$sequence_identifier};

  ### Settings per index: strand counter, alignment strand, read conversion, genome conversion, whether the 2 extra
  ### bases are taken 5' of the read on the forward strand (otherwise 3' of it), and whether the extracted sequence
  ### has to be reverse complemented
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_CT_count','+','CT','CT',0,0);
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_GA_count','-','CT','GA',1,1);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_CT_count','-','GA','CT',0,1);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_GA_count','+','GA','GA',1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment is counted even if the genomic sequence cannot be extracted below
  $counting{$counter}++;

  my $position    = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### the window covers the read plus 2 extra bases; it has to lie entirely within the chromosome
  my $window_start = $extra_bases_upstream ? $position - 2 : $position;
  my $window_fits  = $extra_bases_upstream
                   ? ($position - 2 >= 0)
                   : (length($chromosomes{$read->{chromosome}}) > $position + $read_length + 1);

  my $non_bisulfite_sequence = '';
  if ($window_fits){
    $non_bisulfite_sequence = substr ($chromosomes{$read->{chromosome}},$window_start,$read_length + 2);
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($needs_reverse_complement);
  }

  @{$read}{qw(alignment_strand read_conversion genome_conversion unmodified_genomic_sequence)}
    = ($alignment_strand,$read_conversion_info,$genome_conversion,$non_bisulfite_sequence);

  ### at this point we can also determine the end position of the read
  $read->{end_position} = $position + $read_length;
}
3835
### PBAT variant of the single-end (Bowtie 1 mode) genomic sequence extraction: the stored index of the read is
### shifted by +2 before it is interpreted (indexes 0 and 1 are simply not run), everything else works as for
### standard single-end reads.
###   $sequence_identifier     - read ID, used as the key into $methylation_call_params
###   $methylation_call_params - hash ref; the entry of the read must provide index, chromosome, position and bowtie_sequence
### The entry of the read receives alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence
### and end_position. The extracted sequence is 2 bases longer than the read so that CpG, CHG and CHH context can still
### be determined for a C at the last position of the read; it is left empty if the read sits too close to the edge
### of the chromosome. One of the strand counters in %counting is incremented. Dies if the shifted index is not
### 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end_pbat {
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2;
  my $read       = $methylation_call_params->{$sequence_identifier};

  ### Settings per (shifted) index: strand counter, alignment strand, read conversion, genome conversion, whether the
  ### 2 extra bases are taken 5' of the read on the forward strand (otherwise 3' of it), and whether the extracted
  ### sequence has to be reverse complemented
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement);

  if ($pbat_index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_CT_count','+','CT','CT',0,0);
  }
  elsif ($pbat_index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('CT_GA_count','-','CT','GA',1,1);
  }
  elsif ($pbat_index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_CT_count','-','GA','CT',0,1);
  }
  elsif ($pbat_index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream,$needs_reverse_complement)
      = ('GA_GA_count','+','GA','GA',1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment is counted even if the genomic sequence cannot be extracted below
  $counting{$counter}++;

  my $position    = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### the window covers the read plus 2 extra bases; it has to lie entirely within the chromosome
  my $window_start = $extra_bases_upstream ? $position - 2 : $position;
  my $window_fits  = $extra_bases_upstream
                   ? ($position - 2 >= 0)
                   : (length($chromosomes{$read->{chromosome}}) > $position + $read_length + 1);

  my $non_bisulfite_sequence = '';
  if ($window_fits){
    $non_bisulfite_sequence = substr ($chromosomes{$read->{chromosome}},$window_start,$read_length + 2);
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($needs_reverse_complement);
  }

  @{$read}{qw(alignment_strand read_conversion genome_conversion unmodified_genomic_sequence)}
    = ($alignment_strand,$read_conversion_info,$genome_conversion,$non_bisulfite_sequence);

  ### at this point we can also determine the end position of the read
  $read->{end_position} = $position + $read_length;
}
3941
3942
### Extracts the genomic sequence a single-end read (Bowtie 2 mode) aligned to by walking along its CIGAR string, and
### records which of the 4 possible bisulfite strands the alignment came from.
###   $sequence_identifier     - read ID, used as the key into $methylation_call_params
###   $methylation_call_params - hash ref; the entry of the read must provide index, chromosome, position (1-based,
###                              as in SAM format) and CIGAR
### On success the entry of the read receives alignment_strand, read_conversion, genome_conversion,
### unmodified_genomic_sequence (with 2 extra bases at the end, or at the start, so that CpG, CHG and CHH context can
### also be determined for a C at the last, or first, position of the read), end_position and indels, and one of the
### strand counters in %counting is incremented.
### If the 2 extra bases would lie outside of the chromosome, unmodified_genomic_sequence is set to an empty string
### and the sub returns early without setting any of the other values and without counting the alignment.
### Dies on a malformed CIGAR string, on CIGAR operations other than M, I and D, and if index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  my $cigar      = $methylation_call_params->{$sequence_identifier}->{CIGAR};
  my $read       = $methylation_call_params->{$sequence_identifier};
  my $index      = $read->{index};
  my $chromosome = $read->{chromosome};

  my $non_bisulfite_sequence = '';

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos = $read->{position} - 1;

  # parsing CIGAR string
  my @len = split (/\D+/,$cigar); # storing the length per operation
  my @ops = split (/\d+/,$cigar); # storing the operation
  shift @ops;                     # remove the empty first element
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

  ### OB (index 1) and CTOB (index 3) alignments need the 2 extra bases in front of the read, OT (index 0) and
  ### CTOT (index 2) alignments need them behind the read
  my $extra_bases_upstream   = ($index == 1 || $index == 3);
  my $extra_bases_downstream = ($index == 0 || $index == 2);

  if ($extra_bases_upstream){
    ### right at the start of the chromosome the 2 extra bases do not exist: exiting with an empty genomic sequence
    unless ($pos - 2 >= 0){
      $read->{unmodified_genomic_sequence} = '';
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos - 2,2);
  }

  ### number of inserted and deleted bases, needed to determine the edit distance for the SAM output
  my $indels = 0;

  foreach my $i (0..$#len){
    my ($op,$op_length) = ($ops[$i],$len[$i]);

    if ($op eq 'M'){
      # extracting genomic sequence and moving on along the genome
      $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,$op_length);
      $pos += $op_length;
    }
    elsif ($op eq 'I'){ # insertion in the read sequence
      # padding with Ns instead of genomic sequence (these positions are not used to infer methylation calls);
      # the genomic position does not change
      $non_bisulfite_sequence .= 'N' x $op_length;
      $indels += $op_length;
    }
    elsif ($op eq 'D'){ # deletion in the read sequence
      # no genomic sequence is added, only the genomic position moves on
      $pos += $op_length;
      $indels += $op_length;
    }
    elsif ($op =~ tr/NSHPX=//){ # valid CIGAR operations that must not occur for standard mapping
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  if ($extra_bases_downstream){
    ### right at the end of the chromosome the 2 extra bases do not exist: exiting with an empty genomic sequence
    ### (and not with the partial sequence collected so far), just like for the start of the chromosome above
    unless (length($chromosomes{$chromosome}) >= $pos + 2){
      $read->{unmodified_genomic_sequence} = '';
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  ### Settings per index: strand counter, alignment strand, read conversion, genome conversion and whether the
  ### extracted sequence has to be reverse complemented
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_reverse_complement);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_reverse_complement)
      = ('CT_CT_count','+','CT','CT',0);
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_reverse_complement)
      = ('CT_GA_count','-','CT','GA',1);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_reverse_complement)
      = ('GA_CT_count','-','GA','CT',1);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$needs_reverse_complement)
      = ('GA_GA_count','+','GA','GA',0);
  }
  else{
    die "Too many Bowtie 2 result filehandles\n";
  }

  $counting{$counter}++;
  $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($needs_reverse_complement);

  @{$read}{qw(alignment_strand read_conversion genome_conversion unmodified_genomic_sequence)}
    = ($alignment_strand,$read_conversion_info,$genome_conversion,$non_bisulfite_sequence);

  ### after walking along the CIGAR string $pos holds the end position of the read
  $read->{end_position} = $pos;
  $read->{indels}       = $indels;
}
4076
4077 ### METHYLATION CALL
4078
4079 sub methylation_call{
4080 my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
4081 ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
4082 my @seq = split(//,$sequence_actually_observed);
4083 my @genomic = split(//,$genomic_sequence);
4084 # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
4085 ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
4086 ### CpG, CHH or CHG context
4087
4088 #################################################################
4089 ### . for bases not involving cytosines ###
4090 ### X for methylated C in CHG context (was protected) ###
4091 ### x for not methylated C in CHG context (was converted) ###
4092 ### H for methylated C in CHH context (was protected) ###
4093 ### h for not methylated C in CHH context (was converted) ###
4094 ### Z for methylated C in CpG context (was protected) ###
4095 ### z for not methylated C in CpG context (was converted) ###
4096 ### U for methylated C in unknown context (was protected) ###
4097 ### u for not methylated C in unknwon context (was converted) ###
4098 #################################################################
4099
4100 my @match =();
4101 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
4102 my $methyl_CHH_count = 0;
4103 my $methyl_CHG_count = 0;
4104 my $methyl_CpG_count = 0;
4105 my $methyl_C_unknown_count = 0;
4106 my $unmethylated_CHH_count = 0;
4107 my $unmethylated_CHG_count = 0;
4108 my $unmethylated_CpG_count = 0;
4109 my $unmethylated_C_unknown_count = 0;
4110
4111 if ($read_conversion eq 'CT'){
4112 for my $index (0..$#seq) {
4113 if ($seq[$index] eq $genomic[$index]) {
4114 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation
4115 if ($genomic[$index] eq 'C') {
4116 ### If the residue is a C we want to know if it was in CpG context or in any other context
4117 my $downstream_base = $genomic[$index+1];
4118
4119 if ($downstream_base eq 'G'){
4120 ++$methyl_CpG_count;
4121 push @match,'Z'; # protected C, methylated, in CpG context
4122 }
4123 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4124 ++$methyl_C_unknown_count;
4125 push @match,'U'; # protected C, methylated, in Unknown context
4126 }
4127 else {
4128 ### C in not in CpG-context, determining the second downstream base context
4129 my $second_downstream_base = $genomic[$index+2];
4130
4131 if ($second_downstream_base eq 'G'){
4132 ++$methyl_CHG_count;
4133 push @match,'X'; # protected C, methylated, in CHG context
4134 }
4135 elsif ($second_downstream_base eq 'N'){
4136 ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4137 push @match,'U'; # protected C, methylated, in Unknown context
4138 }
4139 else{
4140 ++$methyl_CHH_count;
4141 push @match,'H'; # protected C, methylated, in CHH context
4142 }
4143 }
4144 }
4145 else {
4146 push @match, '.';
4147 }
4148 }
4149 elsif ($seq[$index] ne $genomic[$index]) {
4150 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts
4151 ### in the actually observed sequence
4152 if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') {
4153 ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context
4154 my $downstream_base = $genomic[$index+1];
4155
4156 if ($downstream_base eq 'G'){
4157 ++$unmethylated_CpG_count;
4158 push @match,'z'; # converted C, not methylated, in CpG context
4159 }
4160 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4161 ++$unmethylated_C_unknown_count;
4162 push @match,'u'; # converted C, not methylated, in Unknown context
4163 }
4164 else{
4165 ### C in not in CpG-context, determining the second downstream base context
4166 my $second_downstream_base = $genomic[$index+2];
4167
4168 if ($second_downstream_base eq 'G'){
4169 ++$unmethylated_CHG_count;
4170 push @match,'x'; # converted C, not methylated, in CHG context
4171 }
4172 elsif ($second_downstream_base eq 'N'){
4173 ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4174 push @match,'u'; # converted C, not methylated, in Unknown context
4175 }
4176 else{
4177 ++$unmethylated_CHH_count;
4178 push @match,'h'; # converted C, not methylated, in CHH context
4179 }
4180 }
4181 }
4182 ### all other mismatches are not of interest for a methylation call
4183 else {
4184 push @match,'.';
4185 }
4186 }
4187 else{
4188 die "There can be only 2 possibilities\n";
4189 }
4190 }
4191 }
4192 elsif ($read_conversion eq 'GA'){
4193 # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
4194
4195 for my $index (0..$#seq) {
4196 if ($seq[$index] eq $genomic[$index+2]) {
4197 ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation
4198 if ($genomic[$index+2] eq 'G') {
4199 ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need
4200 ### to look if the base upstream is a C
4201
4202 my $upstream_base = $genomic[$index+1];
4203
4204 if ($upstream_base eq 'C'){
4205 ++$methyl_CpG_count;
4206 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
4207 }
4208 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4209 ++$methyl_C_unknown_count;
4210 push @match,'U'; # protected C on opposing strand, methylated, in Unknown context
4211 }
4212 else{
4213 ### C in not in CpG-context, determining the second upstream base context
4214 my $second_upstream_base = $genomic[$index];
4215
4216 if ($second_upstream_base eq 'C'){
4217 ++$methyl_CHG_count;
4218 push @match,'X'; # protected C on opposing strand, methylated, in CHG context
4219 }
4220 elsif ($second_upstream_base eq 'N'){
4221 ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4222 push @match,'U'; # protected C, methylated, in Unknown context
4223 }
4224 else{
4225 ++$methyl_CHH_count;
4226 push @match,'H'; # protected C on opposing strand, methylated, in CHH context
4227 }
4228 }
4229 }
4230 else{
4231 push @match, '.';
4232 }
4233 }
4234 elsif ($seq[$index] ne $genomic[$index+2]) {
4235 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts
4236 ### on the opposing strand, so G to A conversions in the actually observed sequence
4237 if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') {
4238 ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if
4239 ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream!
4240
4241 my $upstream_base = $genomic[$index+1];
4242
4243 if ($upstream_base eq 'C'){
4244 ++$unmethylated_CpG_count;
4245 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
4246 }
4247 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG)
4248 ++$unmethylated_C_unknown_count;
4249 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
4250 }
4251 else{
4252 ### C in not in CpG-context, determining the second upstream base context
4253 my $second_upstream_base = $genomic[$index];
4254
4255 if ($second_upstream_base eq 'C'){
4256 ++$unmethylated_CHG_count;
4257 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
4258 }
4259 elsif ($second_upstream_base eq 'N'){
4260 ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
4261 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context
4262 }
4263 else{
4264 ++$unmethylated_CHH_count;
4265 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
4266 }
4267 }
4268 }
4269 ### all other mismatches are not of interest for a methylation call
4270 else {
4271 push @match,'.';
4272 }
4273 }
4274 else{
4275 die "There can be only 2 possibilities\n";
4276 }
4277 }
4278 }
4279 else{
4280 die "Strand conversion info is required to perform a methylation call\n";
4281 }
4282
4283 my $methylation_call = join ("",@match);
4284
4285 $counting{total_meCHH_count} += $methyl_CHH_count;
4286 $counting{total_meCHG_count} += $methyl_CHG_count;
4287 $counting{total_meCpG_count} += $methyl_CpG_count;
4288 $counting{total_meC_unknown_count} += $methyl_C_unknown_count;
4289 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
4290 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
4291 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
4292 $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count;
4293
4294 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
4295 return $methylation_call;
4296 }
4297
sub read_genome_into_memory{
  ### Reads all FastA files (*.fa, or *.fasta if no *.fa files exist) found in $genome_folder and stores
  ### every sequence, upper-cased, in the %chromosomes hash keyed by the name returned by
  ### extract_chromosome_name(). Multi-FastA files are supported.
  ###
  ### Argument: the directory to change back to once the genome has been read.
  ### Dies if: the genome folder cannot be entered, no FastA files are found, a file cannot be opened,
  ###          a file does not start with a FastA header, or a sequence name occurs more than once.
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open so that unusual characters in a file name are never interpreted as an open mode;
    ### the lexical handle is closed explicitly once the file has been processed
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header. An empty file is passed on as an empty header so that
    ### extract_chromosome_name() rejects it without 'uninitialized value' warnings
    my $first_line = <$chr_in>;
    $first_line = '' unless (defined $first_line);
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting with an empty string (not undef) keeps the length checks below well-defined for header-only files
    my $sequence = '';

    ### a named loop variable is used so that the caller's $_ is left untouched
    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;
      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length($sequence)," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length($sequence)," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }
    close $chr_in;

    ### storing the last (or only) sequence of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length($sequence)," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length($sequence)," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
4369
sub extract_chromosome_name {
  ### Returns the sequence name of a FastA header line: the text between the leading '>' and the first
  ### run of whitespace (Bowtie appears to name sequences the same way, so we follow suit).
  ### Dies if the line does not start with '>'.
  my ($fasta_header) = @_;

  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ## only the first whitespace-delimited field is of interest
  my $chromosome_name = (split (/\s+/,$fasta_header,2))[0];
  return $chromosome_name;
}
4381
sub reverse_complement{
  ### Returns the reverse complement of a DNA sequence.
  ### Only upper-case A, C, G and T are complemented; every other character is kept as it is.
  my ($forward) = @_;
  my $revcomp = reverse $forward;   # scalar assignment: reverses the characters of the string
  $revcomp =~ tr/ACGT/TGCA/;
  return $revcomp;
}
4388
sub biTransformFastAFiles {
  ### Writes bisulfite-converted copies of a single-end FastA read file to $temp_dir:
  ### a C->T converted version and, unless $directional is set, also a G->A converted version.
  ### Honours the file-level options $skip, $upto, $gzip (compress the temporary files) and $prefix.
  ### The input is consumed two lines at a time (header line, sequence line); a '.gz' input is read via zcat.
  ###
  ### Argument: path of the FastA file to convert.
  ### Returns:  ($C_to_T_infile,$G_to_A_infile), the temporary file names without $temp_dir
  ###           (the G->A name is returned even when that file was not written in directional mode).
  ### Dies if a file cannot be opened, an output handle fails to close, or a header does not start with '>'.
  my $file = shift;

  ### the temporary files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile. zcat is started in list form (no shell involved), so file names
  ### containing spaces or shell metacharacters are passed on unchanged
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension = $gzip ? 'fa.gz' : 'fa';
  my $C_to_T_infile = "${filename}_C_to_T.$extension";
  my $G_to_A_infile = "${filename}_G_to_A.$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### opens a temporary output file in $temp_dir, piping through gzip if requested
  my $open_output = sub {
    my $name = shift;
    my $fh;
    if ($gzip){
      open ($fh,"| gzip -c - > ${temp_dir}${name}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$name") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  my $ctot = $open_output->($C_to_T_infile);

  my $gtoa;
  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_output->($G_to_A_infile);
  }

  my $count = 0;

  while (1){
    my $header = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already been applied to the header at this point - confirm whether this can still trigger
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### Releasing the input handle. The result is deliberately not checked: if we stopped early because of
  ### $upto, a zcat child is terminated by the closed pipe and reports a non-zero exit status
  close $in;

  close $ctot or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close $gtoa or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
4502
sub biTransformFastAFiles_paired_end {
  ### Writes bisulfite-converted copies of one file of a paired-end FastA data set to $temp_dir.
  ### In directional mode read 1 is written C->T converted only and read 2 G->A converted only;
  ### otherwise both conversions are written for the given file.
  ### '/1' or '/2' (for Bowtie 2: '/1/1' or '/2/2') is appended to every read ID.
  ### Temporary files are always written uncompressed, even if $gzip is set.
  ###
  ### Arguments: path of the FastA file, read number (1 or 2).
  ### Returns:   directional mode: the name of the single file written; otherwise ($C_to_T_infile,$G_to_A_infile).
  ###            Names are returned without $temp_dir.
  ### Dies if a file cannot be opened or closed, the read number is not 1 or 2, or a header does not start with '>'.
  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ### the temporary files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile. zcat is started in list form (no shell involved), so file names
  ### containing spaces or shell metacharacters are passed on unchanged
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = "${filename}_C_to_T.fa";
  my $G_to_A_infile = "${filename}_G_to_A.fa";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### only the handles that are actually needed get opened; an unopened handle stays undef
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;

  while (1){
    my $header = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already been applied to the header at this point - confirm whether this can still trigger
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    ### the read number tag is inserted in front of the line break ($ matches before a trailing newline)
    if ($read_number == 1){
      if ($bowtie2){
        $header =~ s/$/\/1\/1/;
      }
      else{
        $header =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $header =~ s/$/\/2\/2/;
      }
      else{
        $header =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }

    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){
      if ($read_number == 1){
        print {$ctot} "$header$sequence_C_to_T";
      }
      elsif ($read_number == 2){
        print {$gtoa} "$header$sequence_G_to_A";
      }
    }
    else{
      print {$ctot} "$header$sequence_C_to_T";
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### Releasing the input handle. The result is deliberately not checked: if we stopped early because of
  ### $upto, a zcat child is terminated by the closed pipe and reports a non-zero exit status
  close $in;

  ### Closing the output handles makes sure the temporary files are complete on disk before they are used,
  ### and turns write errors into a fatal error (same handling as for paired-end FastQ files)
  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      close $ctot or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      close $gtoa or die "Failed to close filehandle $!\n";
      return ($G_to_A_infile);
    }
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    close $ctot or die "Failed to close filehandle $!\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
4659
4660
sub biTransformFastQFiles {
  ### Writes bisulfite-converted copies of a single-end FastQ read file to $temp_dir:
  ###   $pbat set:          a G->A converted version only
  ###   $directional set:   a C->T converted version only
  ###   otherwise:          both a C->T and a G->A converted version
  ### Honours the file-level options $skip, $upto, $gzip (compress the temporary files) and $prefix.
  ### The input is consumed four lines at a time; a '.gz' input is read via zcat.
  ###
  ### Argument: path of the FastQ file to convert.
  ### Returns:  PBAT mode: ($G_to_A_infile); otherwise ($C_to_T_infile,$G_to_A_infile).
  ###           Names are returned without $temp_dir.
  ### Dies if a file cannot be opened, an output handle fails to close, or the first processed record
  ### does not look like FastQ.
  my $file = shift;

  ### the temporary files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile. zcat is started in list form (no shell involved), so file names
  ### containing spaces or shell metacharacters are passed on unchanged
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  my $extension = $gzip ? 'fastq.gz' : 'fastq';

  ### opens a temporary output file in $temp_dir, piping through gzip if requested
  my $open_output = sub {
    my $name = shift;
    my $fh;
    if ($gzip){
      open ($fh,"| gzip -c - > ${temp_dir}${name}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$name") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  ### a file name only receives its conversion suffix if the corresponding file is actually written
  my ($ctot,$gtoa);

  if ($pbat){ # PBAT-Seq
    $G_to_A_infile .= "_G_to_A.$extension";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_output->($G_to_A_infile);
  }
  else{ # directional or non-directional
    $C_to_T_infile .= "_C_to_T.$extension";
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    $ctot = $open_output->($C_to_T_infile);

    unless ($directional){
      $G_to_A_infile .= "_G_to_A.$extension";
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      $gtoa = $open_output->($G_to_A_infile);
    }
  }

  my $count = 0;
  while (1){
    my $identifier = <$in>;
    my $sequence = <$in>;
    my $identifier2 = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already been applied to the identifier at this point - confirm whether this can still trigger
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($pbat){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
    else{ # directional or non-directional
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

      unless ($directional){
        my $sequence_G_to_A = $sequence;
        $sequence_G_to_A =~ tr/G/A/;
        print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
  }

  ### Releasing the input handle. The result is deliberately not checked: if we stopped early because of
  ### $upto, a zcat child is terminated by the closed pipe and reports a non-zero exit status
  close $in;

  ### $pbat is tested first, exactly as when the handles were opened above, so that only handles
  ### which were really opened get closed (even if $directional happens to be set as well)
  if ($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }

  if ($directional){
    close $ctot or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    close $ctot or die "Failed to close filehandle $!\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
4818
sub biTransformFastQFiles_paired_end {
  ### Writes bisulfite-converted copies of one file of a paired-end FastQ data set to $temp_dir.
  ### In directional mode read 1 is written C->T converted only and read 2 G->A converted only;
  ### otherwise both conversions are written for the given file.
  ### '/1' or '/2' (for Bowtie 2: '/1/1' or '/2/2') is appended to every read ID.
  ### Honours the file-level options $skip, $upto, $gzip (compress the temporary files) and $prefix.
  ###
  ### Arguments: path of the FastQ file, read number (1 or 2).
  ### Returns:   directional mode: the name of the single file written; otherwise ($C_to_T_infile,$G_to_A_infile).
  ###            Names are returned without $temp_dir.
  ### Dies if a file cannot be opened or closed, the read number is not 1 or 2, or the first processed
  ### record does not look like FastQ.
  my ($file,$read_number) = @_;

  ### the temporary files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile. zcat is started in list form (no shell involved), so file names
  ### containing spaces or shell metacharacters are passed on unchanged
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension = $gzip ? 'fastq.gz' : 'fastq';
  my $C_to_T_infile = "${filename}_C_to_T.$extension";
  my $G_to_A_infile = "${filename}_G_to_A.$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### opens a temporary output file in $temp_dir, piping through gzip if requested
  my $open_output = sub {
    my $name = shift;
    my $fh;
    if ($gzip){
      open ($fh,"| gzip -c - > ${temp_dir}${name}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$name") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  ### only the handles that are actually needed get opened; an unopened handle stays undef
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      $ctot = $open_output->($C_to_T_infile);
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      $gtoa = $open_output->($G_to_A_infile);
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $ctot = $open_output->($C_to_T_infile);
    $gtoa = $open_output->($G_to_A_infile);
  }

  my $count = 0;
  while (1){
    my $identifier = <$in>;
    my $sequence = <$in>;
    my $identifier2 = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }
    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

    ### the read number tag is inserted in front of the line break ($ matches before a trailing newline)
    if ($read_number == 1){
      if ($bowtie2){
        $identifier =~ s/$/\/1\/1/;
      }
      else{
        $identifier =~ s/$/\/1/;
      }
    }
    elsif ($read_number == 2){
      if ($bowtie2){
        $identifier =~ s/$/\/2\/2/;
      }
      else{
        $identifier =~ s/$/\/2/;
      }
    }
    else{
      die "Read number needs to be 1 or 2\n";
    }

    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){
      if ($read_number == 1){
        print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      }
      else{
        print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
    else{
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### Releasing the input handle. The result is deliberately not checked: if we stopped early because of
  ### $upto, a zcat child is terminated by the closed pipe and reports a non-zero exit status
  close $in;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close $ctot or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close $gtoa or die "Failed to close filehandle $!\n";
      return ($G_to_A_infile);
    }
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
    close $ctot or die "Failed to close filehandle $!\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
4993
4994
4995 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
4996
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  ### Special Bowtie 1 paired-end format for gzipped temporary files: both FastQ input files are read in
  ### parallel and every read pair is written as ONE tab-separated line to a gzipped file in $temp_dir:
  ###   <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>
  ### The first output file holds read 1 C->T and read 2 G->A converted; unless $directional is set, a
  ### second file holding read 1 G->A and read 2 C->T converted is written as well.
  ### The sequence ID is taken from read 1 (leading '@' removed, '/1' appended).
  ### Honours the file-level options $skip, $upto and $prefix.
  ###
  ### Arguments: path of the read 1 FastQ file, path of the read 2 FastQ file.
  ### Returns:   directional mode: ($CT_plus_GA_infile); otherwise ($CT_plus_GA_infile,$GA_plus_CT_infile).
  ###            Names are returned without $temp_dir.
  ### Dies if a file cannot be opened, an output handle fails to close, or the first record of either
  ### input file does not look like FastQ.
  my ($file_1,$file_2) = @_;

  ### the temporary files are named after input file 1 without its directory part
  my $filename = $file_1;
  if ($file_1 =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### Opens an input file for reading, via zcat if it is gzipped. zcat is started in list form
  ### (no shell involved), so file names containing spaces or shell metacharacters are passed on unchanged
  my $open_input = sub {
    my $path = shift;
    my $fh;
    if ($path =~ /\.gz$/){
      open ($fh,'-|','zcat',$path) or die "Couldn't read from file $path: $!\n";
    }
    else{
      open ($fh,'<',$path) or die "Couldn't read from file $path: $!\n";
    }
    return $fh;
  };

  my $in_1 = $open_input->($file_1);
  my $in_2 = $open_input->($file_2);

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (my $ct_plus_ga,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  my $ga_plus_ct;
  unless ($directional){
    print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open ($ga_plus_ct,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  my $count = 0;
  while (1){
    my $identifier_1 = <$in_1>;
    my $sequence_1 = <$in_1>;
    my $identifier2_1 = <$in_1>;
    my $quality_score_1 = <$in_1>;

    my $identifier_2 = <$in_2>;
    my $sequence_2 = <$in_2>;
    my $identifier2_2 = <$in_2>;
    my $quality_score_2 = <$in_2>;

    ### stop as soon as either file runs out of complete records
    last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence_1 = uc $sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc $sequence_2; # make input file 2 case insensitive

    my $sequence_1_C_to_T = $sequence_1;
    my $sequence_2_G_to_A = $sequence_2;
    $sequence_1_C_to_T =~ tr/C/T/;
    $sequence_2_G_to_A =~ tr/G/A/;

    print {$ct_plus_ga} "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      my $sequence_1_G_to_A = $sequence_1;
      my $sequence_2_C_to_T = $sequence_2;
      $sequence_1_G_to_A =~ tr/G/A/;
      $sequence_2_C_to_T =~ tr/C/T/;
      print {$ga_plus_ct} "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### Releasing the input handles. The results are deliberately not checked: if we stopped early because of
  ### $upto, a zcat child is terminated by the closed pipe and reports a non-zero exit status
  close $in_1;
  close $in_2;

  close $ct_plus_ga or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }
  else{
    close $ga_plus_ct or die "Couldn't close filehandle\n";
    warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
    return ($CT_plus_GA_infile,$GA_plus_CT_infile);
  }
}
5134
5135
sub fix_IDs {
    # Returns a copy of the given read identifier in which every run of
    # spaces and/or tabs is collapsed into a single underscore. All other
    # characters (including a trailing newline) are left untouched.
    my ($read_id) = @_;
    $read_id =~ s/[ \t]+/_/g;
    return $read_id;
}
5141
sub ensure_sensical_alignment_orientation_single_end {
    # Checks whether a single-end alignment produced by alignment instance
    # $index lies on the one strand that makes sense for that instance, and
    # updates the counters stored in the matching @fhs entry.
    #
    #   $index  - position of the alignment instance in @fhs
    #   $strand - strand of the reported alignment ('+' or '-')
    #
    # Returns 1 after incrementing {seen} when the strand is the sensible one,
    # and 0 after incrementing {wrong_strand} when it is the opposite one.
    # For any other $strand value no counter is touched and a false value is
    # returned. Dies if the instance name is not one of the four known
    # read/genome combinations.
    my ($index, $strand) = @_;

    # the only strand on which each read/genome conversion pairing is meaningful
    my %sensible_strand = (
        CTreadCTgenome => '+',    # C->T read against C->T genome
        CTreadGAgenome => '-',    # C->T read against G->A genome
        GAreadCTgenome => '-',    # G->A read against C->T genome
        GAreadGAgenome => '+',    # G->A read against G->A genome
    );

    my $name = $fhs[$index]->{name};
    die "One of the above conditions must be true\n" unless exists $sensible_strand{$name};

    my $expected = $sensible_strand{$name};
    my $opposite = $expected eq '+' ? '-' : '+';

    if ($strand eq $expected) {
        $fhs[$index]->{seen}++;
        return 1;
    }
    elsif ($strand eq $opposite) {
        # nonsensical orientation: count it, but report it as not usable
        $fhs[$index]->{wrong_strand}++;
        return 0;
    }
}
5214
sub ensure_sensical_alignment_orientation_paired_ends {
    # Checks whether a paired-end alignment produced by alignment instance
    # $index has its two mates in the order that makes sense for that
    # instance, and updates the counters stored in the matching @fhs entry.
    #
    #   $index              - position of the alignment instance in @fhs
    #   $id_1, $strand_1    - ID and strand of the first reported mate
    #   $id_2, $strand_2    - ID and strand of the second reported mate
    #
    # The first reported mate has to be on '+' and the second on '-'; the IDs
    # (which must end in 1 or 2) tell which mate came first. Returns 1 after
    # incrementing {seen} if that order is the sensible one for the instance,
    # and 0 after incrementing {wrong_strand} if it is the reverse. Dies for
    # any other ID/strand combination and for unknown instance names.
    my ($index, $id_1, $strand_1, $id_2, $strand_2) = @_;

    # 1: the instance expects read 1 (+) followed by read 2 (-)
    # 0: the instance expects read 2 (+) followed by read 1 (-)
    my %expects_read_1_first = (
        CTread1GAread2CTgenome => 1,
        GAread1CTread2GAgenome => 1,
        GAread1CTread2CTgenome => 0,
        CTread1GAread2GAgenome => 0,
    );

    my $name = $fhs[$index]->{name};
    die "One of the above conditions must be true\n" unless exists $expects_read_1_first{$name};

    # work out which mate was reported first
    my $read_1_first;
    if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-') {
        $read_1_first = 1;
    }
    elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-') {
        $read_1_first = 0;
    }
    else {
        die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
    }

    if ($read_1_first == $expects_read_1_first{$name}) {
        $fhs[$index]->{seen}++;
        return 1;
    }

    # mates are in the reverse of the sensible order for this instance
    $fhs[$index]->{wrong_strand}++;
    return 0;
}
5311
5312 #####################################################################################################################################################
5313
5314 ### Bowtie 1 (default) | PAIRED-END | FASTA
5315
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
    # Bowtie 1, paired-end, FastA input.
    #
    # Opens a pipe from one Bowtie process per usable entry in @fhs and reads
    # the first two output lines (one per mate) of each. For every entry this
    # sets {fh} (the pipe), {last_line_1}/{last_line_2} (the two chomped
    # lines) and {last_seq_id} (the ID of read 1 with its trailing /1
    # removed), or sets the three last_* fields to undef when the entry is
    # skipped or Bowtie returned fewer than two lines.
    #
    # The four file names passed in are only used for the log messages; the
    # files actually aligned come from {inputfile_1}/{inputfile_2} of each
    # @fhs entry.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    warn( $directional
        ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
        : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n" );

    warn( $directional
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # in directional mode entries without an input file are not aligned at all
        if ($directional and !$fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        # restrict Bowtie to the one strand that is sensible for this instance
        my $read_1_forward = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options = $bowtie_options . ($read_1_forward ? ' --norc' : ' --nofw');

        warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
        open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        my $line_1 = $fh->{fh}->getline();
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair there is nothing to buffer
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        chomp ($line_1, $line_2);
        my ($id_1) = split (/\t/, $line_1);    # sequence identifier of the first output line
        my ($id_2) = split (/\t/, $line_2);    # sequence identifier of the second output line

        # Either line may hold read 1 (the one reported first is the one with the
        # smaller chromosomal position); last_seq_id is always taken from read 1.
        if ($id_1 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_2;
        }
        else {
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;
        $fh->{last_line_2} = $line_2;
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5394
5395 ### Bowtie 2 | PAIRED-END | FASTA
5396
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    # Bowtie 2, paired-end, FastA input.
    #
    # Opens a pipe from one Bowtie 2 process per usable entry in @fhs, skips
    # the SAM header and reads the first two record lines (one per mate). For
    # every entry this sets {fh}, {last_line_1}/{last_line_2} (chomped) and
    # {last_seq_id} (ID of read 1 with its trailing /1 removed), or sets the
    # three last_* fields to undef when the entry is skipped or fewer than
    # two record lines were available.
    #
    # The four file names passed in are only used for the log messages.
    #
    # NOTE(review): when neither ID ends in /1 this routine only warns and
    # leaves {last_seq_id} as it was, whereas the other paired-end routines
    # die at that point - confirm whether that difference is intended.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    warn( $directional
        ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
        : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n" );

    warn( $directional
        ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # in directional mode entries without an input file are not aligned at all
        if ($directional and !$fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        # restrict Bowtie 2 to the one strand that is sensible for this instance
        my $read_1_forward = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt2_options = $bowtie_options . ($read_1_forward ? ' --norc' : ' --nofw');

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading until the first line that is not a
        # header (headers start with @) or until the output is exhausted
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        my $line_1 = $_;
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair there is nothing to buffer
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        chomp ($line_1, $line_2);
        my ($id_1) = split (/\t/, $line_1);    # sequence identifier of the first record
        my ($id_2) = split (/\t/, $line_2);    # sequence identifier of the second record

        # last_seq_id is always taken from whichever record carries the /1 tag
        if ($id_1 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_2;
        }
        else {
            warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;
        $fh->{last_line_2} = $line_2;
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5484
5485 ### Bowtie 1 (default) | PAIRED-END | FASTQ
5486
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    # Bowtie 1, paired-end, FastQ input.
    #
    # Opens a pipe from one Bowtie process per usable entry in @fhs and reads
    # the first two output lines (one per mate) of each. With $gzip set the
    # single file in {inputfile_1} is decompressed with zcat and streamed to
    # Bowtie via --12; otherwise {inputfile_1} and {inputfile_2} are passed
    # with -1/-2. For every entry this sets {fh}, {last_line_1}/{last_line_2}
    # (chomped) and {last_seq_id} (ID of read 1 with its trailing /1
    # removed), or sets the three last_* fields to undef when the entry is
    # skipped or Bowtie returned fewer than two lines.
    #
    # The file names passed in are only used for the log messages.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    warn( $directional ? "Input file is $C_to_T_infile_1 (FastQ)\n"
        : $pbat        ? "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n"
        :                "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n" );

    # directional and PBAT libraries only use two of the four alignment instances
    my $two_instances_only = ($directional or $pbat);

    warn( $two_instances_only
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # skipping unwanted filehandles
        if ($two_instances_only and !$fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        # restrict Bowtie to the one strand that is sensible for this instance
        my $read_1_forward = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options = $bowtie_options . ($read_1_forward ? ' --norc' : ' --nofw');

        if ($gzip) {
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
            open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!";
        }
        else {
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
            sleep(5);
            open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
        }

        my $line_1 = $fh->{fh}->getline();
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair there is nothing to buffer
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        chomp ($line_1, $line_2);
        my ($id_1) = split (/\t/, $line_1);    # sequence identifier of the first output line
        my ($id_2) = split (/\t/, $line_2);    # sequence identifier of the second output line

        # Either line may hold read 1 (the one reported first is the one with the
        # smaller chromosomal position); last_seq_id is always taken from read 1.
        if ($id_1 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_2;
        }
        else {
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;
        $fh->{last_line_2} = $line_2;
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5575
5576 ### Bowtie 2 | PAIRED-END | FASTQ
5577
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    # Bowtie 2, paired-end, FastQ input.
    #
    # Opens a pipe from one Bowtie 2 process per usable entry in @fhs, skips
    # the SAM header and reads the first two record lines (one per mate). For
    # every entry this sets {fh}, {last_line_1}/{last_line_2} (chomped) and
    # {last_seq_id} (ID of read 1 with its trailing /1 removed), or sets the
    # three last_* fields to undef when the entry is skipped or fewer than
    # two record lines were available. Dies if neither ID ends in /1.
    #
    # The four file names passed in are only used for the log messages.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    warn( $directional
        ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"
        : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n" );

    warn( $directional
        ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # in directional mode entries without an input file are not aligned at all
        if ($directional and !$fh->{inputfile_1}) {
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        # restrict Bowtie 2 to the one strand that is sensible for this instance
        my $read_1_forward = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt2_options = $bowtie_options . ($read_1_forward ? ' --norc' : ' --nofw');

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading until the first line that is not a
        # header (headers start with @) or until the output is exhausted
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        my $line_1 = $_;
        my $line_2 = $fh->{fh}->getline();

        # without a complete first pair there is nothing to buffer
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
            next;
        }

        chomp ($line_1, $line_2);
        my ($id_1) = split (/\t/, $line_1);    # sequence identifier of the first record
        my ($id_2) = split (/\t/, $line_2);    # sequence identifier of the second record

        # last_seq_id is always taken from whichever record carries the /1 tag
        if ($id_1 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//) {
            $fh->{last_seq_id} = $id_2;
        }
        else {
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1;
        $fh->{last_line_2} = $line_2;
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5666
5667 #####################################################################################################################################################
5668
5669 ### Bowtie 1 (default) | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    # Bowtie 1, single-end, FastA input.
    #
    # Opens a pipe from one Bowtie process per entry in @fhs (decompressing
    # the input with zcat first when $gzip is set) and reads the first output
    # line of each. For every entry this sets {fh}, {last_line} (the chomped
    # line) and {last_seq_id} (its first tab-separated field), or sets both
    # last_* fields to undef when Bowtie returned nothing.
    #
    # The two file names passed in are only used for the log messages; the
    # file actually aligned comes from {inputfile} of each @fhs entry.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    warn( $directional
        ? "Input file is $C_to_T_infile (FastA)\n"
        : "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n" );

    warn( $directional
        ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # restrict Bowtie to the one strand that is sensible for this instance
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt_options = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

        # compressed input is streamed through zcat, uncompressed input is read directly
        my $command = $gzip
            ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |"
            : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |";
        open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

        $_ = $fh->{fh}->getline();

        # no output at all: nothing to buffer for this instance
        unless ($_) {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            @{$fh}{qw(last_seq_id last_line)} = (undef) x 2;
            next;
        }

        chomp;
        my ($id) = split (/\t/);    # first element of the bowtie output (= the sequence identifier)
        $fh->{last_seq_id} = $id;
        $fh->{last_line}   = $_;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5723
5724 ### Bowtie 2 | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    # Bowtie 2, single-end, FastA input.
    #
    # Opens a pipe from one Bowtie 2 process per entry in @fhs, skips the SAM
    # header and reads the first record line. For every entry this sets {fh},
    # {last_line} (the chomped record) and {last_seq_id} (its first
    # tab-separated field), or sets both last_* fields to undef when no record
    # line was available.
    #
    # The two file names passed in are only used for the log messages; the
    # file actually aligned comes from {inputfile} of each @fhs entry.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    warn( $directional
        ? "Input file is $C_to_T_infile (FastA)\n"
        : "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n" );

    warn( $directional
        ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
        : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

    foreach my $fh (@fhs) {

        # restrict Bowtie 2 to the one strand that is sensible for this instance
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt2_options = $bowtie_options . ($forward_only ? ' --norc' : ' --nofw');

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading until the first line that is not a
        # header (headers start with @) or until the output is exhausted
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        # Bowtie 2 prints a record even for reads without an alignment, so an
        # empty result should only occur once the output is exhausted
        unless ($_) {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            @{$fh}{qw(last_seq_id last_line)} = (undef) x 2;
            next;
        }

        chomp;
        my ($id) = split (/\t/);    # first element of the Bowtie output (= the sequence identifier)
        $fh->{last_seq_id} = $id;
        $fh->{last_line}   = $_;
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5783
5784
5785 ### Bowtie 1 (default) | SINGLE-END | FASTQ
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
    # Bowtie 1, single-end, FastQ input.
    #
    # Opens a pipe from one Bowtie process per entry in @fhs (decompressing
    # the input with zcat first when $gzip is set) and reads the first output
    # line of each. For every entry this sets {fh}, {last_line} (the chomped
    # line) and {last_seq_id} (its first tab-separated field), or sets both
    # last_* fields to undef when Bowtie returned nothing.
    #
    # The two file names passed in are only used for the log messages; the
    # file actually aligned comes from {inputfile} of each @fhs entry.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastQ)\n";
    }
    elsif ($pbat){
        warn "Input file is $G_to_A_infile (FastQ)\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
    }

    ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output
    if ($directional or $pbat){
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # per-instance options: the user supplied options plus the strand restriction
        my $bt_options = $bowtie_options;
        if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
            $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
        }
        else {
            $bt_options .= ' --nofw';
        }

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
        sleep (5);

        # The commands must use $bt_options (not the bare $bowtie_options), otherwise
        # the --norc/--nofw restriction announced in the message above is never
        # handed to Bowtie.
        if ($gzip){
            open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
        }
        else{
            open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
        }

        # if Bowtie produces an alignment we store the first line of the output
        $_ = $fh->{fh}->getline();
        if ($_) {
            chomp;
            my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
            $fh->{last_seq_id} = $id;
            $fh->{last_line} = $_;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined
        else {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line} = undef;
        }
    }
}
5844
### Bowtie 2 | SINGLE-END | FASTQ
#
# Starts one Bowtie 2 process for every entry of the file-scoped @fhs list and
# primes each entry with the first non-header line of that process' output.
#
# Arguments:
#   $C_to_T_infile - name of the C->T converted read file (used for messages only)
#   $G_to_A_infile - name of the G->A converted read file (used for messages only)
#
# For every entry of @fhs this sets:
#   fh          - read handle on the Bowtie 2 output pipe
#   last_seq_id - first tab-separated field of the first non-header line, or undef
#   last_line   - that line without its trailing newline, or undef
#
# Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    my ($C_to_T_infile, $G_to_A_infile) = @_;

    if ($directional) {
        warn "Input file is $C_to_T_infile (FastQ)\n\n";
    }
    else {
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
    }

    ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files
    if ($directional) {
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else {
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {
        # Reads converted the same way as the genome are only aligned in forward
        # orientation (--norc), all other combinations only as reverse complement
        # (--nofw), ensuring the alignments are only reported in a sensible manner.
        my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt2_options     = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
        warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

        # 3-arg pipe open: the mode is stated explicitly instead of being derived
        # from the contents of the command string.
        my $bowtie2_command = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile}";
        open ($fh->{fh}, '-|', $bowtie2_command) or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM, so everything up to the first non-header line is
        # skipped (SAM headers start with @). A lexical variable is used so that
        # the caller's $_ is left untouched; $line is undef if the output ended
        # before any alignment line was seen.
        my $line;
        while (defined ($line = $fh->{fh}->getline())) {
            last unless ($line =~ /^\@/);
        }

        # Bowtie 2 outputs a result line even for sequences without any
        # alignments, so the first line of the output is stored.
        if (defined $line) {
            chomp $line;
            my ($id) = split (/\t/, $line);   # first field = the sequence identifier
            $fh->{last_seq_id} = $id;
            $fh->{last_line}   = $line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise last_seq_id and last_line are initialised as undefined
        else {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
        }
    }
}
5906
5907 ###########################################################################################################################################
5908
# Resets the file-scoped %counting statistics to zero and rebuilds the
# file-scoped @fhs list of alignment descriptors for the next input.
#
# Argument:
#   $filename - name of the input; a name containing a comma is treated as a
#               paired-end "mate1,mate2" entry, anything else as single-end
#
# Which descriptors end up in @fhs:
#   directional, single-end    : CTreadCTgenome, CTreadGAgenome
#   pbat,        single-end    : GAreadCTgenome, GAreadGAgenome
#   every other combination    : all four
sub reset_counters_and_fhs {
    my $filename = shift;

    # Strand counters:
    #   CT_CT    CT read/CT genome, original top strand
    #   CT_GA    CT read/GA genome, original bottom strand
    #   GA_CT    GA read/CT genome, complementary to original top strand
    #   GA_GA    GA read/GA genome, complementary to original bottom strand
    #   CT_GA_CT CT read1/GA read2/CT genome, original top strand
    #   GA_CT_GA GA read1/CT read2/GA genome, complementary to original bottom strand
    #   GA_CT_CT GA read1/CT read2/CT genome, complementary to original top strand
    #   CT_GA_GA CT read1/GA read2/GA genome, original bottom strand
    # alignments_rejected_count is only relevant if --directional was specified.
    %counting = map { $_ => 0 } qw(
        total_meCHH_count
        total_meCHG_count
        total_meCpG_count
        total_meC_unknown_count
        total_unmethylated_CHH_count
        total_unmethylated_CHG_count
        total_unmethylated_CpG_count
        total_unmethylated_C_unknown_count
        sequences_count
        no_single_alignment_found
        unsuitable_sequence_count
        genomic_sequence_could_not_be_extracted_count
        unique_best_alignment_count
        low_complexity_alignments_overruled_count
        CT_CT_count
        CT_GA_count
        GA_CT_count
        GA_GA_count
        CT_GA_CT_count
        GA_CT_GA_count
        GA_CT_CT_count
        CT_GA_GA_count
        alignments_rejected_count
    );

    # name => [ strand_identity, bisulfiteIndex ]
    my %properties_of = (
        CTreadCTgenome => [ 'con ori forward',       $CT_index_basename ],
        CTreadGAgenome => [ 'con ori reverse',       $GA_index_basename ],
        GAreadCTgenome => [ 'compl ori con forward', $CT_index_basename ],
        GAreadGAgenome => [ 'compl ori con reverse', $GA_index_basename ],
    );
    my @all_four = qw(CTreadCTgenome CTreadGAgenome GAreadCTgenome GAreadGAgenome);

    my @selected;
    if ($directional) {
        @selected = ($filename =~ /,/) ? @all_four : @all_four[0, 1];
    }
    elsif ($pbat) {
        @selected = ($filename =~ /,/) ? @all_four : @all_four[2, 3];
    }
    else {
        @selected = @all_four;
    }

    # a fresh hash per descriptor, in the order selected above
    @fhs = map {
        +{
            name            => $_,
            strand_identity => $properties_of{$_}->[0],
            bisulfiteIndex  => $properties_of{$_}->[1],
            seen            => 0,
            wrong_strand    => 0,
        }
    } @selected;
}
6058
6059
# Returns $path with a '/' appended unless it already ends in one.
sub _ensure_trailing_slash {
    my ($path) = @_;
    $path =~ s/$/\// unless ($path =~ /\/$/);
    return $path;
}

# Moves into $dir, creating it first if it cannot be entered, and returns its
# absolute path with a trailing slash. $label ('output' or 'temporary') is only
# used in the "Created ... directory" message. The process stays in $dir.
sub _enter_or_create_directory {
    my ($dir, $label) = @_;
    $dir = _ensure_trailing_slash($dir);

    unless (chdir $dir) {
        mkdir $dir or die "Unable to create directory $dir $!\n";
        warn "Created $label directory $dir!\n\n";
        chdir $dir or die "Failed to move to $dir\n";
    }
    return _ensure_trailing_slash(getcwd());   # making the path absolute
}

# Parses and validates the command line (@ARGV) and assembles the option
# string that is later handed to every Bowtie / Bowtie 2 instance.
#
# Side effects:
#   - consumes @ARGV (options via Getopt::Long, then the genome folder and,
#     for single-end mode, the remaining file names)
#   - fills the file-scoped @filenames ("mate1,mate2" entries for paired-end
#     mode, plain file names for single-end mode)
#   - creates the output and temporary directories if they do not exist
#   - changes the working directory several times; on return it is the
#     temporary directory if one was given, otherwise $parent_dir
#   - prints the help text or the version banner and exits if requested
#
# Dies with an explanatory message on any invalid or incompatible option.
#
# Returns, in this order:
#   $genome_folder, $CT_index_basename, $GA_index_basename, $path_to_bowtie,
#   $sequence_format, $bowtie_options, $directional, $unmapped, $multi_map,
#   $phred64, $solexa, $output_dir, $bowtie2, $vanilla, $sam_no_hd, $skip,
#   $qupto, $temp_dir, $non_bs_mm, $insertion_open, $insertion_extend,
#   $deletion_open, $deletion_extend, $gzip, $bam, $samtools_path, $pbat,
#   $prefix, $old_flag
sub process_command_line {
    my @bowtie_options;   # collected in order; joined into one string further down

    my ($help, $version, $quiet);
    my ($mates1, $mates2, $fastq, $fasta, $sequence_format);
    my ($skip, $qupto);
    my ($phred33, $phred64, $solexa);
    my ($path_to_bowtie, $bowtie2, $samtools_path);
    my ($mismatches, $seed_length, $best, $chunk, $ceiling, $maxins, $minins);
    my ($seed_extension_fails, $reseed_repetitive_seeds, $most_valid_alignments);
    my ($score_min, $parallel, $rdg, $rfg);
    my ($non_directional, $pbat);
    my ($unmapped, $multi_map, $output_dir, $temp_dir, $prefix);
    my ($vanilla, $sam_no_hd, $non_bs_mm, $bam, $gzip, $old_flag);

    # Named $options_ok so that it does not shadow the file-scoped $command_line.
    my $options_ok = GetOptions (
        'help|man'                => \$help,
        '1=s'                     => \$mates1,
        '2=s'                     => \$mates2,
        'path_to_bowtie=s'        => \$path_to_bowtie,
        'f|fasta'                 => \$fasta,
        'q|fastq'                 => \$fastq,
        's|skip=i'                => \$skip,
        'u|upto=i'                => \$qupto,
        'phred33-quals'           => \$phred33,
        'phred64-quals|solexa1'   => \$phred64,
        'solexa-quals'            => \$solexa,
        'n|seedmms=i'             => \$mismatches,
        'l|seedlen=i'             => \$seed_length,
        'no_best'                 => \$best,
        'version'                 => \$version,
        'quiet'                   => \$quiet,
        'chunkmbs=i'              => \$chunk,
        'non_directional'         => \$non_directional,
        'I|minins=i'              => \$minins,
        'X|maxins=i'              => \$maxins,
        'e|maqerr=i'              => \$ceiling,
        'un|unmapped'             => \$unmapped,
        'ambiguous'               => \$multi_map,
        'o|output_dir=s'          => \$output_dir,
        'bowtie2'                 => \$bowtie2,
        'vanilla'                 => \$vanilla,
        'sam-no-hd'               => \$sam_no_hd,
        'D=i'                     => \$seed_extension_fails,
        'R=i'                     => \$reseed_repetitive_seeds,
        'score_min=s'             => \$score_min,
        'most_valid_alignments=i' => \$most_valid_alignments,
        'p=i'                     => \$parallel,
        'temp_dir=s'              => \$temp_dir,
        'rdg=s'                   => \$rdg,
        'rfg=s'                   => \$rfg,
        'non_bs_mm'               => \$non_bs_mm,
        'samtools_path=s'         => \$samtools_path,
        'bam'                     => \$bam,
        'gzip'                    => \$gzip,
        'pbat'                    => \$pbat,
        'prefix=s'                => \$prefix,
        'old_flag'                => \$old_flag,
    );

    ### EXIT ON ERROR if there were errors with any of the supplied options
    unless ($options_ok) {
        die "Please respecify command line options\n";
    }

    ### HELPFILE
    if ($help) {
        print_helpfile();
        exit;
    }

    if ($version) {
        # NOTE(review): banner text reproduced as visible in the reviewed copy;
        # any leading whitespace used for centring could not be seen - confirm.
        print <<"VERSION";


Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Version: $bismark_version
Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/


VERSION
        exit;
    }


    ##########################
    ### PROCESSING OPTIONS ###
    ##########################

    $bowtie2   = 0 unless ($bowtie2);
    $sam_no_hd = 0 unless ($sam_no_hd);

    ### PATH TO BOWTIE
    ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
    if ($path_to_bowtie) {
        $path_to_bowtie = _ensure_trailing_slash($path_to_bowtie);
        unless (-d $path_to_bowtie) {
            die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
        }
        $path_to_bowtie .= $bowtie2 ? 'bowtie2' : 'bowtie';
    }
    elsif ($bowtie2) {
        $path_to_bowtie = 'bowtie2';
        warn "Path to Bowtie 2 specified as: $path_to_bowtie\n";
    }
    else {
        $path_to_bowtie = 'bowtie';
        warn "Path to Bowtie specified as: $path_to_bowtie\n";
    }

    ### OUTPUT REQUESTED AS BAM FILE
    ### $bam ends up as 1 (Samtools available) or 2 (fall back to gzipped SAM)
    if ($bam) {
        if ($vanilla) {
            die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
        }

        ### PATH TO SAMTOOLS
        if (defined $samtools_path) {
            # anything not ending in 'samtools' is taken to be the folder containing it
            unless ($samtools_path =~ /samtools$/) {
                $samtools_path = _ensure_trailing_slash($samtools_path) . 'samtools';
            }
            unless (-e $samtools_path) {
                die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
            }
            warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
            $bam = 1;
        }
        # Check whether Samtools is in the PATH if no path was supplied by the user
        elsif (system ('which samtools >/dev/null 2>&1') == 0) {
            $samtools_path = `which samtools`;
            chomp $samtools_path;
            warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
            $bam = 1;
        }

        unless (defined $samtools_path) {
            $bam = 2;
            warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
        }
        sleep (1);
    }


    ####################################
    ### PROCESSING ARGUMENTS

    ### GENOME FOLDER
    my $genome_folder = shift @ARGV;   # mandatory
    unless ($genome_folder) {
        warn "Genome folder was not specified!\n";
        print_helpfile();
        exit;
    }

    ### checking that the genome folder, all subfolders and the required bowtie index files exist
    $genome_folder = _ensure_trailing_slash($genome_folder);

    if (chdir $genome_folder) {
        my $absolute_genome_folder = _ensure_trailing_slash(getcwd());   ## making the genome folder path absolute
        warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder')\n";
        $genome_folder = $absolute_genome_folder;
    }
    else {
        die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
    }

    my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
    my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";

    # every index consists of six files: <basename>.{1,2,3,4,rev.1,rev.2}.<extension>
    my @index_parts = qw(1 2 3 4 rev.1 rev.2);

    if ($bowtie2) {   ### Bowtie 2 (new)
        ### checking the integrity of $CT_dir
        chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
        foreach my $file (map { "BS_CT.$_.bt2" } @index_parts) {
            unless (-f $file) {
                die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
            }
        }
        ### checking the integrity of $GA_dir
        chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
        foreach my $file (map { "BS_GA.$_.bt2" } @index_parts) {
            unless (-f $file) {
                die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
            }
        }
    }
    else {   ### Bowtie 1 (default)
        ### checking the integrity of $CT_dir
        chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
        foreach my $file (map { "BS_CT.$_.ebwt" } @index_parts) {
            unless (-f $file) {
                die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
            }
        }
        ### checking the integrity of $GA_dir
        chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
        foreach my $file (map { "BS_GA.$_.ebwt" } @index_parts) {
            unless (-f $file) {
                die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
            }
        }
    }

    my $CT_index_basename = "${CT_dir}BS_CT";
    my $GA_index_basename = "${GA_dir}BS_GA";

    ### INPUT OPTIONS

    ### SEQUENCE FILE FORMAT
    ### exits if both fastA and FastQ were specified
    if ($fasta and $fastq) {
        die "Only one sequence filetype can be specified (fastA or fastQ)\n";
    }

    ### unless fastA is specified explicitly, fastQ sequence format is expected by default
    if ($fasta) {
        print "FastA format specified\n";
        $sequence_format = 'FASTA';
        push @bowtie_options, '-f';
    }
    elsif ($fastq) {
        print "FastQ format specified\n";
        $sequence_format = 'FASTQ';
        push @bowtie_options, '-q';
    }
    else {
        $fastq = 1;
        print "FastQ format assumed (by default)\n";
        $sequence_format = 'FASTQ';
        push @bowtie_options, '-q';
    }

    ### SKIP / UPTO
    ### Both values are only reported and returned here; they are not added to the Bowtie options.
    if ($skip) {
        warn "Skipping the first $skip reads from the input file\n";
    }
    if ($qupto) {
        warn "Processing sequences up to read no. $qupto from the input file\n";
    }

    ### QUALITY VALUES
    if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)) {
        die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)\n";
    }

    if ($phred33) {   ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
        # Phred quality values work only when -q is specified
        unless ($fastq) {
            die "Phred quality values works only when -q (FASTQ) is specified\n";
        }
        push @bowtie_options, ($bowtie2 ? '--phred33' : '--phred33-quals');
    }

    if ($phred64) {
        # Phred quality values work only when -q is specified
        unless ($fastq) {
            die "Phred quality values work only when -q (FASTQ) is specified\n";
        }
        push @bowtie_options, ($bowtie2 ? '--phred64' : '--phred64-quals');
    }
    else {
        $phred64 = 0;
    }

    if ($solexa) {
        if ($bowtie2) {
            die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
        }
        # Solexa to Phred value conversion works only when -q is specified
        unless ($fastq) {
            die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
        }
        push @bowtie_options, '--solexa-quals';
    }
    else {
        $solexa = 0;
    }

    ### ALIGNMENT OPTIONS

    ### MISMATCHES
    if (defined $mismatches) {
        if ($bowtie2) {
            unless ($mismatches == 0 or $mismatches == 1) {
                die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
            }
            push @bowtie_options, "-N $mismatches";
        }
        else {
            unless ($mismatches >= 0 and $mismatches <= 3) {
                die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
            }
            push @bowtie_options, "-n $mismatches";
        }
    }
    elsif (not $bowtie2) {
        push @bowtie_options, '-n 1';   # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
    }

    ### SEED LENGTH
    if (defined $seed_length) {
        push @bowtie_options, ($bowtie2 ? "-L $seed_length" : "-l $seed_length");
    }

    ### MISMATCH CEILING
    if (defined $ceiling) {
        die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
        push @bowtie_options, "-e $ceiling";
    }

    ### BOWTIE 2 EFFORT OPTIONS

    ### CONSECUTIVE SEED EXTENSION FAILS
    if (defined $seed_extension_fails) {
        die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
        push @bowtie_options, "-D $seed_extension_fails";
    }

    ### RE-SEEDING REPETITIVE SEEDS
    if (defined $reseed_repetitive_seeds) {
        die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
        push @bowtie_options, "-R $reseed_repetitive_seeds";
    }

    ### BOWTIE 2 SCORING OPTIONS
    if ($score_min) {
        die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
        unless ($score_min =~ /^L,.+,.+$/) {
            die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
        }
        push @bowtie_options, "--score-min $score_min";
    }
    elsif ($bowtie2) {
        push @bowtie_options, '--score-min L,0,-0.2';   # default setting, more stringent than normal Bowtie2
    }

    ### BOWTIE 2 READ GAP OPTIONS (--rdg sets the deletion penalties; default 5,3)
    my ($insertion_open, $insertion_extend, $deletion_open, $deletion_extend);

    if ($rdg) {
        die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
        if ($rdg =~ /^(\d+),(\d+)$/) {
            ($deletion_open, $deletion_extend) = ($1, $2);
        }
        else {
            die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
        }
        push @bowtie_options, "--rdg $rdg";
    }
    else {
        ($deletion_open, $deletion_extend) = (5, 3);
    }

    ### BOWTIE 2 REFERENCE GAP OPTIONS (--rfg sets the insertion penalties; default 5,3)
    if ($rfg) {
        die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
        if ($rfg =~ /^(\d+),(\d+)$/) {
            ($insertion_open, $insertion_extend) = ($1, $2);
        }
        else {
            die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
        }
        push @bowtie_options, "--rfg $rfg";
    }
    else {
        ($insertion_open, $insertion_extend) = (5, 3);
    }

    ### BOWTIE 2 PARALLELIZATION OPTIONS
    if (defined $parallel) {
        die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!\n" unless ($bowtie2);
    }
    if ($bowtie2 and $parallel) {
        die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
        push @bowtie_options, "-p $parallel";
        push @bowtie_options, '--reorder';   ## re-orders the bowtie 2 output so that it does match the input files. This is absolutely required for parallelization to work.
        print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
        sleep (2);
    }

    ### REPORTING OPTIONS
    if ($bowtie2) {
        push @bowtie_options, '--ignore-quals';   ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default

        ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7; it is accepted but only triggers a warning
        if (defined $most_valid_alignments) {
            warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
        }
    }
    else {   # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
        push @bowtie_options, '-k 2';
    }

    ### --BEST
    if ($bowtie2) {
        if ($best) {   # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
            die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
        }
    }
    elsif (not $best) {
        # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
        push @bowtie_options, '--best';
    }

    ### VANILLA BISMARK (BOWTIE 1) OUTPUT
    if ($vanilla) {
        if ($bowtie2) {
            die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
        }
    }
    else {
        $vanilla = 0;
    }

    ### PAIRED-END MAPPING
    if ($mates1) {
        my @mates1 = split (/,/, $mates1);
        die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
        my @mates2 = split (/,/, $mates2);
        unless (scalar @mates1 == scalar @mates2) {
            die "Paired-end mapping requires the same amount of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
        }

        # Pairing stops at the first empty file name. The test is on length, not
        # truth, so that a file literally named "0" is not mistaken for the end.
        foreach my $index (0 .. $#mates1) {
            my ($mate1, $mate2) = ($mates1[$index], $mates2[$index]);
            last unless (length $mate1 and length $mate2);
            push @filenames, "$mate1,$mate2";
        }

        if ($bowtie2) {
            push @bowtie_options, '--no-mixed';        ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
            push @bowtie_options, '--no-discordant';   ## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
        }

        if ($old_flag) {
            warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless ($vanilla);
            sleep (3);
        }
    }
    elsif ($mates2) {
        die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
    }

    ### SINGLE-END MAPPING
    # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
    my $singles;
    unless ($mates1 and $mates2) {
        $singles = join (',', @ARGV);
        unless ($singles) {
            die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
        }
        $singles =~ s/\s/,/g;
        @filenames = split (/,/, $singles);
        warn "\nFiles to be analysed:\n";
        warn "@filenames\n\n";
        sleep (3);
    }

    ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
    if (defined $minins) {
        die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
        push @bowtie_options, "--minins $minins";
    }

    ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
    if (defined $maxins) {
        die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
        push @bowtie_options, "--maxins $maxins";
    }
    elsif (not $singles) {
        push @bowtie_options, '--maxins 500';
    }

    ### QUIET prints nothing besides alignments (suppresses warnings)
    if ($quiet) {
        push @bowtie_options, '--quiet';
    }

    ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
    unless ($bowtie2) {   # Bowtie 2 does not have a chunkmbs option
        if (defined $chunk) {
            push @bowtie_options, "--chunkmbs $chunk";
        }
        else {
            push @bowtie_options, '--chunkmbs 512';   ## setting the default to 512MB (up from 64 default)
        }
    }

    ### SUMMARY OF ALL BOWTIE OPTIONS
    my $bowtie_options = join (' ', @bowtie_options);

    ### STRAND-SPECIFIC LIBRARIES
    my $directional;
    if ($non_directional) {
        die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
        warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
        sleep (3);
        $directional = 0;
    }
    elsif ($pbat) {
        die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
        die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
        die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);

        warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
        sleep (3);
        $directional = 0;
    }
    else {
        warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
        sleep (3);
        $directional = 1;   # default behaviour
    }

    ### UNMAPPED SEQUENCE OUTPUT
    $unmapped = 0 unless ($unmapped);

    ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
    $multi_map = 0 unless ($multi_map);

    ### OUTPUT DIRECTORY
    ### relative paths are resolved against the directory Bismark was started from
    chdir $parent_dir or die "Failed to move back to current working directory\n";
    if ($output_dir) {
        $output_dir = _enter_or_create_directory($output_dir, 'output');
        warn "Output will be written into the directory: $output_dir\n";
    }
    else {
        $output_dir = '';
    }

    ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
    chdir $parent_dir or die "Failed to move back to current working directory\n";
    if ($temp_dir) {
        warn "\nUsing temp directory: $temp_dir\n";
        $temp_dir = _enter_or_create_directory($temp_dir, 'temporary');
        warn "Temporary files will be written into the directory: $temp_dir\n";
    }
    else {
        $temp_dir = '';
    }

    ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
    if ($non_bs_mm and $vanilla) {
        die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
    }

    ### PREFIX FOR OUTPUT FILES
    if ($prefix) {
        $prefix =~ s/\.+$//;   # removing trailing dots
        warn "Using the following prefix for output files: $prefix\n\n";
        sleep (1);
    }

    return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag);
}
6782
6783
6784
# Writes the SAM header to the OUT filehandle:
#   @HD - header line (VN = format version, SO = sort order)
#   @SQ - one line per entry of %chromosomes (SN = sequence name, LN = length),
#         in the order in which keys() returns them
#   @PG - program line (ID = identifier, VN = Bismark version, CL = command line)
sub generate_SAM_header {
    print OUT join ("\t", '@HD', 'VN:1.0', 'SO:unsorted'), "\n";

    foreach my $name (keys %chromosomes) {
        my $sequence_length = length ($chromosomes{$name});
        print OUT join ("\t", '@SQ', "SN:$name", "LN:$sequence_length"), "\n";
    }

    print OUT join ("\t", '@PG', 'ID:Bismark', "VN:$bismark_version", qq{CL:"bismark $command_line"}), "\n";
}
6793
6794 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
6795 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
6796
### Prints one SAM formatted line for a single-end alignment to the OUT filehandle.
###
### Arguments (in this order):
###   $id                      - read ID; also the key into the methylation call hash
###   $actual_seq              - read sequence
###   $methylation_call_params - reference to a hash holding the alignment details for $id
###   $qual                    - quality string of the read
###
### The optional fields NM, XX, XM, XR and XG are always written; XA (number of non-bisulfite mismatches)
### is only written if $non_bs_mm is set.
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  my $strand            = $methylation_call_params->{$id}{alignment_strand};
  my $chr               = $methylation_call_params->{$id}{chromosome};
  my $start             = $methylation_call_params->{$id}{position};
  my $ref_seq           = $methylation_call_params->{$id}{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}{genome_conversion};

  # For Bowtie 2 this starts out as the alignment score and is only converted into a number of mismatches further down
  my $number_of_mismatches = $bowtie2
    ? $methylation_call_params->{$id}{alignment_score}
    : $methylation_call_params->{$id}{number_of_mismatches};

  ##### FLAG
  ### Of the bitwise FLAG field only bit 0x10 (SEQ being reverse complemented, value 16) is used for single-end reads.
  ### The alignment strand together with the read/genome conversion identifies the bisulfite strand:
  ###   OT   (+ strand, read CT, genome CT) => 0
  ###   CTOB (+ strand, read GA, genome GA) => 16 (yields information for the original bottom strand)
  ###   OB   (- strand, read CT, genome GA) => 16
  ###   CTOT (- strand, read GA, genome CT) => 0  (yields information for the original top strand)

  die "Unexpected strand information: $strand\n\n" unless ($strand eq '+' or $strand eq '-');

  my $flag;
  if ($strand eq '+'){
    $flag = ($read_conversion eq 'CT' && $genome_conversion eq 'CT') ? 0           # OT
          : ($read_conversion eq 'GA' && $genome_conversion eq 'GA') ? 16          # CTOB
          :                                                            undef;
  }
  else{
    $flag = ($read_conversion eq 'CT' && $genome_conversion eq 'GA') ? 16          # OB
          : ($read_conversion eq 'GA' && $genome_conversion eq 'CT') ? 0           # CTOT
          :                                                            undef;
  }
  die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n" unless defined $flag;

  ##### Fields which are constant for single-end alignments

  my $mapq  = 255;                                    # Assume mapping quality is unavailable
  my $rnext = "*";                                    # Paired-end variable
  my $pnext = 0;                                      # Paired-end variable
  my $tlen  = 0;                                      # Paired-end variable

  ##### CIGAR

  # Bowtie 2 reports an actual CIGAR string; Bowtie 1 output does not contain indels (only matches and mismatches)
  my $cigar = $bowtie2 ? $methylation_call_params->{$id}{CIGAR} : length($actual_seq) . "M";

  ##### Reference sequence

  # The genomic sequence is 2 bp longer than needed. The extra bases are removed from the 3' end for CT converted
  # reads (original top or bottom strands) and from the 5' end otherwise (complementary strands in non-directional libraries)
  my $trimmed_length = length($ref_seq) - 2;
  $ref_seq = substr($ref_seq, ($read_conversion eq 'CT' ? 0 : 2), $trimmed_length);

  if ($strand eq '-'){
    $actual_seq = revcomp($actual_seq);               # Sequence represented on the forward genomic strand
    $ref_seq    = revcomp($ref_seq);                  # Required for comparison with actual sequence
    $qual       = reverse $qual;                      # the quality string has to follow the orientation of the sequence
  }

  ##### Optional tags

  # NM: edit distance to the reference (substitutions, plus the inserted/deleted bases for Bowtie 2 alignments)
  my $edit_distance = hemming_dist($actual_seq,$ref_seq);
  $edit_distance += $methylation_call_params->{$id}{indels} if $bowtie2;
  my $NM_tag = "NM:i:$edit_distance";

  # XX: string providing mismatched reference bases in the alignment (NO indel information!)
  my $XX_tag = make_mismatch_string($actual_seq, $ref_seq);

  # XM: methylation call string; it is reversed whenever the sequence was reverse-complemented
  my $XM_tag = 'XM:Z:' . ($strand eq '+' ? $methcall : scalar reverse($methcall));

  my $XR_tag = "XR:Z:$read_conversion";               # XR: read conversion
  my $XG_tag = "XG:Z:$genome_conversion";             # XG: genome conversion

  ##### Optionally turning the Bowtie 2 alignment score into a number of mismatches

  if ($non_bs_mm and $bowtie2){

    $number_of_mismatches =~ s/-//;                   # removing the minus sign

    ### the penalties for any indels listed in the CIGAR string are taken off the alignment score first
    if ($cigar =~ /[DI]/){

      my @len = split (/\D+/,$cigar);                 # the length per operation
      my @ops = split (/\d+/,$cigar);                 # the operations
      shift @ops;                                     # remove the empty first element
      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (@len == @ops);

      foreach my $i (0..$#len){
        my $op = $ops[$i];
        next if $op eq 'M';                           # irrelevant

        if ($op eq 'I'){                              # insertion in the read sequence
          $number_of_mismatches -= $insertion_open;
          $number_of_mismatches -= $len[$i] * $insertion_extend;
        }
        elsif ($op eq 'D'){                           # deletion in the read sequence
          $number_of_mismatches -= $deletion_open;
          $number_of_mismatches -= $len[$i] * $deletion_extend;
        }
        elsif ($cigar =~ tr/[NSHPX=]//){              # if these (for standard mapping) illegal characters exist we die
          die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else{
          die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
      }
    }

    ### Ns receive a penalty of -1, but normal mismatches receive -6, so the remainder of a division by 6 is taken
    ### as the number of Ns. This might still break if the sequence contained more than 5 Ns, but this should occur close to never
    my $seq_N_count = $number_of_mismatches % 6;
    $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
  }

  my $XA_tag = "XA:Z:$number_of_mismatches";

  ##### Output

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @fields = ($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag);
  push @fields, $XA_tag if $non_bs_mm;                # optionally print number of non-bisulfite mismatches

  print OUT join("\t",@fields),"\n";
}
7004
### Prints the two SAM formatted lines (read 1 first, then read 2) of a paired-end alignment to the OUT filehandle.
###
### Arguments (in this order):
###   $id                          - read ID; also the key into the methylation call hash
###   $actual_seq_1, $actual_seq_2 - sequences of read 1 and read 2
###   $methylation_call_params     - reference to a hash holding the alignment details for $id
###   $qual_1, $qual_2             - quality strings of read 1 and read 2
###
### The optional fields NM, XX, XM, XR and XG are always written; XA (read 1) and XB (read 2), the number of
### non-bisulfite mismatches, are only written if $non_bs_mm is set.
sub paired_end_SAM_output{
  my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;

  my $strand_1          = $methylation_call_params->{$id}{alignment_read_1};       # Bowtie 1 only reports the read 1 alignment strand
  my $strand_2          = $methylation_call_params->{$id}{alignment_read_2};
  my $chr               = $methylation_call_params->{$id}{chromosome};
  my $ref_seq_1         = $methylation_call_params->{$id}{unmodified_genomic_sequence_1};
  my $ref_seq_2         = $methylation_call_params->{$id}{unmodified_genomic_sequence_2};
  my $methcall_1        = $methylation_call_params->{$id}{methylation_call_1};
  my $methcall_2        = $methylation_call_params->{$id}{methylation_call_2};
  my $read_conversion_1 = $methylation_call_params->{$id}{read_conversion_1};
  my $read_conversion_2 = $methylation_call_params->{$id}{read_conversion_2};
  my $genome_conversion = $methylation_call_params->{$id}{genome_conversion};

  # appending /1 or /2 confuses some downstream programs such as Picard, so this is only done if $old_flag is set
  my $id_1 = $old_flag ? $id.'/1' : $id;
  my $id_2 = $old_flag ? $id.'/2' : $id;

  # Allows all degenerate nucleotide sequences in reference genome
  foreach my $ref_seq ($ref_seq_1,$ref_seq_2){
    die "Reference sequence ($ref_seq) contains invalid nucleotides!\n" if $ref_seq =~ /[^ACTGNRYMKSWBDHV]/i;
  }

  ##### Strand origin of the alignment

  my $index = ($read_conversion_1 eq 'CT' && $genome_conversion eq 'CT') ? 0       # OT   (original top strand)
            : ($read_conversion_1 eq 'GA' && $genome_conversion eq 'GA') ? 1       # CTOB (complementary to OB)
            : ($read_conversion_1 eq 'GA' && $genome_conversion eq 'CT') ? 2       # CTOT (complementary to OT)
            : ($read_conversion_1 eq 'CT' && $genome_conversion eq 'GA') ? 3       # OB   (original bottom)
            :                                                              undef;
  die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n" unless defined $index;

  ##### Mismatches / alignment scores (only needed for custom allele-specific output, not the default!)

  my ($number_of_mismatches_1,$number_of_mismatches_2);

  if ($bowtie2){                                      # Bowtie 2 reports always as read 1 then read 2, so this is fine
    $number_of_mismatches_1 = $methylation_call_params->{$id}{alignment_score_1};
    $number_of_mismatches_2 = $methylation_call_params->{$id}{alignment_score_2};
  }
  elsif ($index == 2 or $index == 3){                 # Bowtie 1 reports always the leftmost read first; for CTOT or OB the values are swapped
    $number_of_mismatches_1 = $methylation_call_params->{$id}{number_of_mismatches_2};
    $number_of_mismatches_2 = $methylation_call_params->{$id}{number_of_mismatches_1};
  }
  else{                                               # if the first read aligned in forward direction it is like for Bowtie 2
    $number_of_mismatches_1 = $methylation_call_params->{$id}{number_of_mismatches_1};
    $number_of_mismatches_2 = $methylation_call_params->{$id}{number_of_mismatches_2};
  }

  ##### Reference sequences

  ### read + 2bp long fragments were extracted to make a methylation call at the first or last position, so 2 bp of the
  ### genomic sequence need to be removed again. For OT or OB these come off the end of the read 1 sequence and the
  ### start of the read 2 sequence; for CTOT or CTOB it is the other way round
  my ($offset_1,$offset_2) = ($index == 0 || $index == 3) ? (0,2) : (2,0);
  $ref_seq_1 = substr($ref_seq_1,$offset_1,length($ref_seq_1)-2);
  $ref_seq_2 = substr($ref_seq_2,$offset_2,length($ref_seq_2)-2);

  ##### Start and end positions of both reads

  my ($start_read_1,$start_read_2);
  my ($end_read_1,$end_read_2);

  if ($bowtie2){
    $start_read_1 = $methylation_call_params->{$id}{position_1};
    $start_read_2 = $methylation_call_params->{$id}{position_2};
    $end_read_1   = $methylation_call_params->{$id}{end_position_1};
    $end_read_2   = $methylation_call_params->{$id}{end_position_2};
  }
  else{                                               # Bowtie 1 output. $strand_1 stores the alignment of Read 1
    my $fragment_start = $methylation_call_params->{$id}{start_seq_1};
    my $fragment_end   = $methylation_call_params->{$id}{alignment_end};

    if ($strand_1 eq '+'){                            # Read 1 aligns to the + strand
      $start_read_1 = $fragment_start;
      $end_read_1   = $fragment_start + length ($actual_seq_1)-1;
      $start_read_2 = $fragment_end - length ($actual_seq_2) + 1;
      $end_read_2   = $fragment_end;
    }
    else{                                             # read 1 is on the - strand
      $start_read_1 = $fragment_end - length ($actual_seq_1) + 1;
      $end_read_1   = $fragment_end;
      $start_read_2 = $fragment_start;
      $end_read_2   = $fragment_start + length ($actual_seq_2)-1;
    }
  }

  ##### FLAG

  ### Bits of the bitwise FLAG field which are used here (taken from "The SAM Format Specification (v1.4-r985), September 7, 2011"):
  ##   0x1   template having multiple segments in sequencing      (  1)
  ##   0x2   each segment properly aligned according to the aligner (  2)
  ##   0x10  SEQ being reverse complemented                       ( 16)
  ##   0x20  SEQ of the next segment in the template being reversed ( 32)
  ##   0x40  the first segment in the template (read 1)           ( 64)
  ##   0x80  the last segment in the template (read 2)            (128)

  ### As the FLAG value does not consider that there might be 4 different bisulfite strands of DNA, the FLAG values take the
  ### strand identity into account: OT and CTOT are treated as aligning to the top strand, OB and CTOB as aligning to the bottom strand.
  ### The default values have been suggested by Peter Hickey, Australia; the former values are used if $old_flag is set

  my ($flag_1,$flag_2);

  if ($index == 0 or $index == 2){                    # OT or CTOT
    ($flag_1,$flag_2) = $old_flag
      ? (67,131)                                      # (1+2+64)       and (1+2+128)
      : (99,147);                                     # (1+2+32+64)    and (1+2+16+128)
  }
  else{                                               # CTOB or OB
    ($flag_1,$flag_2) = $old_flag
      ? (115,179)                                     # (1+2+16+32+64) and (1+2+16+32+128)
      : (83,163);                                     # (1+2+16+64)    and (1+2+32+128)
  }

  ##### MAPQ, CIGAR, RNEXT and PNEXT

  my $mapq = 255;                                     # Mapping quality is unavailable

  my ($cigar_1,$cigar_2);
  if ($bowtie2){                                      # Actual CIGAR strings reported by Bowtie 2
    $cigar_1 = $methylation_call_params->{$id}{CIGAR_1};
    $cigar_2 = $methylation_call_params->{$id}{CIGAR_2};
  }
  else{                                               # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
    $cigar_1 = length($actual_seq_1) . "M";
    $cigar_2 = length($actual_seq_2) . "M";
  }

  my $rnext   = '=';                                  # Chromosome of mate; applies to both reads
  my $pnext_1 = $start_read_2;                        # Leftmost position of mate
  my $pnext_2 = $start_read_1;

  ##### TLEN: signed observed Template LENgth (or inferred fragment size)

  my ($tlen_1,$tlen_2);

  if ($bowtie2){
    ### The template stretches from the start of the leftmost read to whichever read ends further to the right. If one
    ### read is fully contained within the other one this is simply the length of the longer alignment. The read which
    ### starts leftmost (read 1 if both start at the same position) gets the + sign, its mate the - sign.
    ### (dovetailing of reads is not enabled for Bowtie 2 alignments)
    my $rightmost_end = ($end_read_2 >= $end_read_1) ? $end_read_2 : $end_read_1;

    if ($start_read_1 <= $start_read_2){              # Read 1 alignment is leftmost
      $tlen_1 = $rightmost_end - $start_read_1 + 1;
      $tlen_2 = -$tlen_1;
    }
    else{                                             # Read 2 alignment is leftmost
      $tlen_2 = $rightmost_end - $start_read_2 + 1;
      $tlen_1 = -$tlen_2;
    }
  }
  else{
    ### Bowtie 1: reads do not contain each other, also no dovetailing, so the read ending rightmost is the rightmost read
    if ($end_read_2 >= $end_read_1){                  # Read 1 alignment is leftmost
      $tlen_1 = $end_read_2 - $start_read_1 + 1;      # Leftmost read has a + sign,
      $tlen_2 = -$tlen_1;                             # Rightmost read has a - sign
    }
    else{                                             # Read 2 alignment is leftmost
      $tlen_2 = $end_read_1 - $start_read_2 + 1;      # Leftmost read has a + sign,
      $tlen_1 = -$tlen_2;                             # Rightmost read has a - sign
    }
  }

  ##### Adjusting the strand of the sequences before they are used to generate mismatch strings

  if ($strand_1 eq '-'){
    $actual_seq_1 = revcomp($actual_seq_1);           # Sequence represented on the forward genomic strand
    $ref_seq_1    = revcomp($ref_seq_1);              # Required for comparison with actual sequence
    $qual_1       = reverse $qual_1;                  # the quality string needs to be reversed as well
  }
  if ($strand_2 eq '-'){
    $actual_seq_2 = revcomp($actual_seq_2);           # Mate sequence represented on the forward genomic strand
    $ref_seq_2    = revcomp($ref_seq_2);              # Required for comparison with actual sequence
    $qual_2       = reverse $qual_2;                  # the quality string needs to be reversed as well
  }

  ##### Optional tags

  # NM: edit distance to the reference (substitutions, plus the inserted/deleted bases for Bowtie 2 alignments)
  my $edit_distance_1 = hemming_dist($actual_seq_1,$ref_seq_1);
  my $edit_distance_2 = hemming_dist($actual_seq_2,$ref_seq_2);
  if ($bowtie2){
    $edit_distance_1 += $methylation_call_params->{$id}{indels_1};
    $edit_distance_2 += $methylation_call_params->{$id}{indels_2};
  }
  my $NM_tag_1 = "NM:i:$edit_distance_1";
  my $NM_tag_2 = "NM:i:$edit_distance_2";

  # XX: string providing mismatched reference bases in the alignment (NO indel information!)
  my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1);
  my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);

  # XM: methylation call string; needs to be reversed if the sequence was reverse complemented
  my $XM_tag_1 = 'XM:Z:' . ($strand_1 eq '-' ? scalar reverse($methcall_1) : $methcall_1);
  my $XM_tag_2 = 'XM:Z:' . ($strand_2 eq '-' ? scalar reverse($methcall_2) : $methcall_2);

  my $XR_tag_1 = "XR:Z:$read_conversion_1";           # XR: Read 1 conversion state
  my $XR_tag_2 = "XR:Z:$read_conversion_2";           # XR: Read 2 conversion state
  my $XG_tag   = "XG:Z:$genome_conversion";           # XG: Genome Conversion state; valid for both reads

  ##### Optionally turning the Bowtie 2 alignment scores into numbers of mismatches

  if ($non_bs_mm and $bowtie2){
    $number_of_mismatches_1 = _paired_end_score_to_mismatches($number_of_mismatches_1,$cigar_1);
    $number_of_mismatches_2 = _paired_end_score_to_mismatches($number_of_mismatches_2,$cigar_2);
  }

  my $XA_tag = "XA:Z:$number_of_mismatches_1";
  my $XB_tag = "XB:Z:$number_of_mismatches_2";

  ##### Output

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @fields_1 = ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1, $XR_tag_1, $XG_tag);
  my @fields_2 = ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2, $XR_tag_2, $XG_tag);

  if ($non_bs_mm){                                    # optionally print number of non-bisulfite mismatches
    push @fields_1, $XA_tag;
    push @fields_2, $XB_tag;
  }

  print OUT join("\t", @fields_1), "\n";
  print OUT join("\t", @fields_2), "\n";
}

### Private helper of paired_end_SAM_output: converts the Bowtie 2 alignment score of one read into a number of
### mismatches. Takes the alignment score and the CIGAR string of the read and returns the number of mismatches.
### Dies if the CIGAR string cannot be parsed or contains operations other than 'M', 'I' and 'D'.
sub _paired_end_score_to_mismatches{
  my ($score,$cigar) = @_;

  $score =~ s/-//;                                    # removing the minus sign

  ### if the read contained any indels their penalties are taken off the alignment score first
  if ($cigar =~ /[DI]/){

    my @len = split (/\D+/,$cigar);                   # the length per operation
    my @ops = split (/\d+/,$cigar);                   # the operations
    shift @ops;                                       # remove the empty first element
    die "CIGAR string '$cigar' contained a non-matching number of lengths and operations\n" unless (@len == @ops);

    foreach my $i (0..$#len){
      my $op = $ops[$i];
      next if $op eq 'M';                             # irrelevant

      if ($op eq 'I'){                                # insertion in the read sequence
        $score -= $insertion_open;
        $score -= $len[$i] * $insertion_extend;
      }
      elsif ($op eq 'D'){                             # deletion in the read sequence
        $score -= $deletion_open;
        $score -= $len[$i] * $deletion_extend;
      }
      elsif ($cigar =~ tr/[NSHPX=]//){                # if these (for standard mapping) illegal characters exist we die
        die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
      else{
        die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
    }
  }

  ### Ns receive a penalty of -1, but normal mismatches receive -6, so the remainder of a division by 6 is taken as the
  ### number of Ns. This might still break if the sequence contained more than 5 Ns, but this should occur close to never
  my $N_count = $score % 6;
  return int ($score / 6) + $N_count;
}
7509
### Returns the reverse complement of the sequence handed in. A, C, G and T are complemented (lower case
### a, c, g and t end up as upper case complements); any other character is only reversed, not changed.
### Dies if no sequence, or a sequence which evaluates to false, was passed.
sub revcomp{
  my ($seq) = @_;
  die "Missing seq to reverse complement\n" unless $seq;

  (my $revcomp = scalar reverse($seq)) =~ tr/ACTGactg/TGACTGAC/;
  return $revcomp;
}
7516
### Returns the number of positions of the read (first argument) at which the character differs from the
### character at the same position of the reference sequence (second argument).
###
### Only the positions of the read are looked at: read positions which lie beyond the end of a shorter
### reference count as differences, surplus positions of a longer reference are ignored.
### Note that indels are not taken into account here, this is a plain position by position comparison.
sub hemming_dist{
  my ($actual_seq,$ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  # A bitwise string XOR yields a NUL character wherever both sequences agree. The shorter string is treated as
  # if it was padded with NUL characters, so positions without a counterpart come out as non-NUL (= different)
  # without ever touching an undefined value. Interpolating both values makes sure they are XORed as strings.
  my $differences = substr("$actual_seq" ^ "$ref_seq", 0, length($actual_seq));

  return ($differences =~ tr/\0//c);                  # counts all characters other than NUL
}
7526
### Generates the optional SAM tag XX for a read: the string 'XX:Z:' followed by the number of matching positions
### and the reference base of every mismatching position, e.g. 'XX:Z:2C1' (2 matches, reference base C, 1 match).
### The first argument is the read sequence, the second one the reference sequence; both are compared position
### by position (NO indel information!). Dies if either sequence is missing or evaluates to false.
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq = shift or die "Missing reference sequence";

  my $XX_tag = "XX:Z:";
  my $ref_length = length($ref_seq);
  my $differences = ($actual_seq ^ $ref_seq);         # Bitwise comparison: NUL characters wherever both sequences agree
  my $prev_mm_pos = 0;                                # Position of the previous mismatch (1-based, 0 = none so far)

  while ($differences =~ /[^\0]/g){                   # Where bitwise comparison showed a difference
    my $mm_pos = pos($differences);
    my $matches_since_last_mm = $mm_pos - $prev_mm_pos - 1;

    $XX_tag .= $matches_since_last_mm if $matches_since_last_mm > 0;     # Ignore if mismatches are adjacent to each other

    # The reference base is only available if the position lies within the reference sequence. This is tested
    # right here instead of using a conditional declaration ('my $base = ... if ...'), as the behaviour of the
    # latter is documented to be undefined
    $XX_tag .= substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

    $prev_mm_pos = $mm_pos;
  }

  my $end_matches = $ref_length - $prev_mm_pos;       # Number of matches from last mismatch till end of sequence
  $XX_tag .= $end_matches if $end_matches > 0;        # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
7544
7545
7546
7547 sub print_helpfile{
7548 print << "HOW_TO";
7549
7550
7551 This program is free software: you can redistribute it and/or modify
7552 it under the terms of the GNU General Public License as published by
7553 the Free Software Foundation, either version 3 of the License, or
7554 (at your option) any later version.
7555
7556 This program is distributed in the hope that it will be useful,
7557 but WITHOUT ANY WARRANTY; without even the implied warranty of
7558 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
7559 GNU General Public License for more details.
7560 You should have received a copy of the GNU General Public License
7561 along with this program. If not, see <http://www.gnu.org/licenses/>.
7562
7563
7564
7565 DESCRIPTION
7566
7567
7568 The following is a brief description of command line options and arguments to control the Bismark
7569 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
7570 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
7571 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
7572 Each of these reads is then aligned to the bisulfite treated forward strand index of a reference genome
7573 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
7574 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
7575 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
7576 sequence from the genome and determine if there were any protected C's present or not.
7577
7578 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
7579 re-enabled by using --non_directional.
7580
7581 The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
7582 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
7583 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
7584
7585
7586 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
7587
7588
7589 ARGUMENTS:
7590
7591 <genome_folder> The path to the folder containing the unmodified reference genome
7592 as well as the subfolders created by the Bismark_Genome_Preparation
7593 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
7594 Bismark expects one or more fastA files in this folder (file extension: .fa
7595 or .fasta). The path can be relative or absolute.
7596
7597 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
7598 "_1"), e.g. flyA_1.fq,flyB_1.fq. Sequences specified with this option must
7599 correspond file-for-file and read-for-read with those specified in <mates2>.
7600 Reads may be a mix of different lengths. Bismark will produce one mapping result
7601 and one report file per paired-end input file pair.
7602
7603 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
7604 "_2"), e.g. flyA_2.fq,flyB_2.fq. Sequences specified with this option must
7605 correspond file-for-file and read-for-read with those specified in <mates1>.
7606 Reads may be a mix of different lengths.
7607
7608 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
7609 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
7610 produce one mapping result and one report file per input file.
7611
7612
7613 OPTIONS:
7614
7615
7616 Input:
7617
7618 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
7619 files (usually having extension .fq or .fastq). This is the default. See also
7620 --solexa-quals.
7621
7622 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
7623 files (usually having extension .fa, .mfa, .fna or similar). All quality values
7624 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
7625 the read name and the sequence on a single line (and not spread over several lines).
7626
7627 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
7628
7629 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
7630
7631 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
7632
7633 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
7634
7635 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
7636 (which can't). The formula for conversion is:
7637 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
7638 is usually the right option for use with (unconverted) reads emitted by the GA
7639 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
7640
7641 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
7642 reads emitted by GA Pipeline version 1.3 or later. Default: off.
7643
7644 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
7645 specified it is assumed that Bowtie (1 or 2) is in the PATH.
7646
7647
7648 Alignment:
7649
7650 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
7651 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
7652 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
7653
7654 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
7655 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
7656 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
7657
7658 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
7659 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
7660 quality values to the nearest 10 and saturates at 30. This value is not relevant for
7661 Bowtie 2.
7662
7663 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
7664 --best mode. Best-first search must keep track of many paths at once to ensure it is
7665 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
7666 memory impact of the descriptors, but they can still grow very large in some cases. If
7667 you receive an error message saying that chunk memory has been exhausted in --best mode,
7668 try adjusting this parameter up to dedicate more memory to the descriptors. This value
7669 is not relevant for Bowtie 2. Default: 512.
7670
7671 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
7672 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
7673 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
7674 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
7675
7676 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
7677 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
7678 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
7679 A 61-bp gap would not be valid in that case. Default: 500.
7680
7681
7682 Bowtie 1 Reporting:
7683
7684 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
7685 will be used by default.
7686
7687 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
7688 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
7689 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
7690 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
7691 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
7692 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
7693 the best alignment). --best mode also removes all strand bias. Note that --best does not
7694 affect which alignments are considered "valid" by Bowtie, only which valid alignments
7695 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
7696 Default: on.
7697
7698 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
7699 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
7700
7701
7702 Output:
7703
7704 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
7705 bisulfite strands will be reported. Default: OFF.
7706
7707 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
7708 to the original strands are merely theoretical and should not exist in reality. Specifying directional
7709 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
7710 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
7711 for strand-specific libraries).
7712
7713 --pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
7714 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
7715 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
7716 and OB ones. Use this option only if you are certain that your libraries were constructed following
7717 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
7718 --pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
7719 temporary files only).
7720
7721 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
7722 split up into several smaller files to run concurrently and the output files are to be merged.
7723
7724 --quiet Print nothing besides alignments.
7725
7726 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
7727 of SAM format output.
7728
7729 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
7730 appear as they did in the input, without any translation of quality values that may have
7731 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
7732 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
7733 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
7734 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
7735
7736 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
7737 mismatches or other reads that fail to align uniquely to a file in the output directory.
7738 Written reads will appear as they did in the input, without any of the translation of quality
7739 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
7740 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
7741 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
7742
7743 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
7744 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
7745 to create it first. The path to the output folder can be either relative or absolute.
7746
7747 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
7748 the specified folder does not exist, Bismark will attempt to create it first. The path to the
7749 temporary folder can be either relative or absolute.
7750
7751 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read had during the
7752 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
7753 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
7754 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
7755 and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
7756
7757 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
7758 space. This option is available for most alignment modes but is not available for paired-end FastA
7759 files. This option might be somewhat slower than writing out uncompressed files, but this awaits
7760 further testing.
7761
7762 --bam The output will be written out in BAM format instead of the default SAM format. Bismark will
7763 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
7764 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
7765 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
7766
7767 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
7768 explicitly if Samtools is in the PATH already.
7769
7770 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
7771 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
7772
7773 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
7774 this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
7775 both the appended read IDs and custom FLAG values may cause problems with some downstream tools
7776 such as Picard, new defaults were implemented as of version 0.8.3.
7777
7778
7779 default old_flag
7780 =================== ===================
7781 Read 1 Read 2 Read 1 Read 2
7782
7783 OT: 99 147 67 131
7784
7785 OB: 83 163 115 179
7786
7787 CTOT: 99 147 67 131
7788
7789 CTOB: 83 163 115 179
7790
7791
7792
7793 Other:
7794
7795 -h/--help Displays this help file.
7796
7797 -v/--version Displays version information.
7798
7799
7800 BOWTIE 2 SPECIFIC OPTIONS
7801
7802 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
7803 alignments, i.e. searches for alignments involving all read characters (also called
7804 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
7805 and/or quality trimmed where appropriate. Default: off.
7806
7807 Bowtie 2 alignment options:
7808
7809 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
7810 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
7811 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
7812 Bowtie 1 see -n).
7813
7814 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
7815 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
7816 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
7817 Bowtie 1 see -l).
7818
7819 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
7820 position to be the highest possible, regardless of the actual value. I.e. input is treated
7821 as though all quality values are high. This is also the default behavior when the input
7822 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
7823
7824
7825 Bowtie 2 paired-end options:
7826
7827 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
7828 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
7829 on by default.
7830
7831 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
7832 A discordant alignment is an alignment where both mates align uniquely, but that does not
7833 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
7834 and it is on by default.
7835
7836
7837 Bowtie 2 effort options:
7838
7839 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
7840 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
7841 new second-best alignment. Default: 15.
7842
7843 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
7844 When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
7845 mismatches allowed) at different offsets and searches for more alignments. A read is considered
7846 to have repetitive seeds if the total number of seed hits divided by the number of seeds
7847 that aligned at least once is greater than 300. Default: 2.
7848
7849 Bowtie 2 parallelization options:
7850
7851
7852 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
7853 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
7854 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
7855 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
7856 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
7857 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
7858 automatically use the option '--reorder', which guarantees that output SAM records are printed in
7859 an order corresponding to the order of the reads in the original input file, even when -p is set
7860 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
7861 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
7862 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
7863 correspond to input order in that case.
7864
7865 Bowtie 2 Scoring options:
7866
7867 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
7868 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
7869 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
7870 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
7871 L,0,-0.2.
7872
7873 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
7874 of <int1> + N * <int2>. Default: 5, 3.
7875
7876 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
7877 a penalty of <int1> + N * <int2>. Default: 5, 3.
7878
7879
7880 Bowtie 2 Reporting options:
7881
7882 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
7883 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
7884 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
7885 effort expended to find valid alignments.
7886
7887 For reference, this used to be the old (now deprecated) description of -M:
7888 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
7889 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
7890 happens first. Only the best alignment is reported. Information from the other alignments is used to
7891 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
7892 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
7893 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
7894 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
7895 always used and its default value is set to 10.
7896
7897
7898 'VANILLA' Bismark OUTPUT:
7899
7900 Single-end output format (tab-separated):
7901
7902 (1) <seq-ID>
7903 (2) <read alignment strand>
7904 (3) <chromosome>
7905 (4) <start position>
7906 (5) <end position>
7907 (6) <observed bisulfite sequence>
7908 (7) <equivalent genomic sequence>
7909 (8) <methylation call>
7910 (9) <read conversion>
7911 (10) <genome conversion>
7912 (11) <read quality score (Phred33)>
7913
7914
7915 Paired-end output format (tab-separated):
7916 (1) <seq-ID>
7917 (2) <read 1 alignment strand>
7918 (3) <chromosome>
7919 (4) <start position>
7920 (5) <end position>
7921 (6) <observed bisulfite sequence 1>
7922 (7) <equivalent genomic sequence 1>
7923 (8) <methylation call 1>
7924 (9) <observed bisulfite sequence 2>
7925 (10) <equivalent genomic sequence 2>
7926 (11) <methylation call 2>
7927 (12) <read 1 conversion>
7928 (13) <genome conversion>
7929 (14) <read 1 quality score (Phred33)>
7930 (15) <read 2 quality score (Phred33)>
7931
7932
7933 Bismark SAM OUTPUT (default):
7934
7935 (1) QNAME (seq-ID)
7936 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
7937 (3) RNAME (chromosome)
7938 (4) POS (start position)
7939 (5) MAPQ (always 255)
7940 (6) CIGAR
7941 (7) RNEXT
7942 (8) PNEXT
7943 (9) TLEN
7944 (10) SEQ
7945 (11) QUAL (Phred33 scale)
7946 (12) NM-tag (edit distance to the reference)
7947 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
7948 (14) XM-tag (methylation call string)
7949 (15) XR-tag (read conversion state for the alignment)
7950 (16) XG-tag (genome conversion state for the alignment)
7951 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
7952
7953 Each read of paired-end alignments is written out in a separate line in the above format.
7954
7955
7956 Last edited on 07 October 2013.
7957
7958 HOW_TO
7959 }