comparison new/bismark @ 7:fcadce4d9a06 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author bgruening
date Sat, 06 May 2017 13:18:09 -0400
parents
children
comparison
equal deleted inserted replaced
6:0f8646f22b8d 7:fcadce4d9a06
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use IO::Handle;
5 use Cwd;
6 $|++;
7 use Getopt::Long;
8
9
10 ## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
### Script-wide settings; everything below is captured before any option processing takes place
my $bismark_version = 'v0.14.3';
my $parent_dir      = getcwd;               # the working directory Bismark was started from
my $command_line    = join (" ",@ARGV);     # the command line exactly as typed (i.e. before the rewrite below)


### Getopt::Long cannot cope with the '.' in --solexa1.3-quals, so this option is rewritten to
### --phred64-quals in place before the command line is processed
for my $index (0..$#ARGV){
  $ARGV[$index] = '--phred64-quals' if ($ARGV[$index] eq '--solexa1.3-quals');
}
my @filenames; # will be populated by processing the command line

my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
    $prefix,
    $old_flag,
    $basename,
    $score_min_intercept,
    $score_min_slope,
    $bt2_large_index,
    $multicore,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the mouse genome
my %SQ_order;    # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes
my %counting;    # counting various events
my @pids;        # storing the process IDs of child processes in parallel mode


my $seqID_contains_tabs;
my $verbose = 0;

warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n" if ($multicore > 1);
54
55
### Spawns the child processes needed for a multicore run.
###
### Takes no arguments; reads the file-level $multicore and appends the PID of every spawned child
### to the file-level @pids.
###
### Returns ($process_id,$offset):
###   single-core mode : (1,1), nothing is forked
###   parent process   : (PID of the child forked last, $multicore)
###   child process    : (0, number of the child: 1 for the first one forked, 2 for the second, ...)
### $offset is the number of the share of the input the calling process has to work on.
###
### Dies if a fork fails (children that were already started are not stopped).
sub multi_process_handling{

  my $offset = 1;
  my $process_id;

  unless ($multicore > 1){
    warn "Single-core mode: setting pid to 1\n";
    $process_id = 1;
    return ($process_id,$offset);
  }

  # Using '<' rather than testing for equality so that the loop also terminates if $multicore
  # should ever not be a whole number
  while ($offset < $multicore){
    my $fork = fork;

    # fork returns undef on failure. The previous message claimed that processing would carry on
    # single-threaded, which never happened; we now say what really happens, and why
    unless (defined $fork){
      die "Forking unsuccessful, aborting (requested number of parallel instances: $multicore): $!\n";
    }

    if ($fork == 0){
      # child process: keeps the offset it was forked with and proceeds to processing
      $process_id = 0;
      last;
    }

    # parent process: remember the child and move on to the next offset
    $process_id = $fork;
    push @pids, $process_id;
    ++$offset;
  }

  return ($process_id,$offset);
}
110
111
### Writes the share of a FastQ file that one parallel instance has to process to a temporary file.
###
### Arguments: $filename   - FastQ input file; it is read through zcat if the name ends in 'gz'
###            $process_id - only used for the progress message
###            $offset     - number of the share to extract: a 4-line record is kept whenever
###                          (record count - $offset) % $multicore == 0
###
### The subset is written to ${temp_dir}<name>.temp.<offset> (gzip compressed and carrying an
### additional '.gz' suffix if $gzip is set), where <name> is $filename without path information.
### Returns the name of the subset file WITHOUT ${temp_dir}; callers have to prepend it themselves.
###
### Dies if the input or output file cannot be opened.
sub subset_input_file_FastQ{

  my ($filename,$process_id,$offset) = @_;

  my $in;
  if ($filename =~ /gz$/){
    # list form of a piped open: the filename reaches zcat as a single argument and is never
    # interpreted by a shell (spaces or shell metacharacters in file names are harmless)
    open ($in,'-|','zcat',$filename) or die "Couldn't read from file '$filename': $!\n";
  }
  else{
    # 3-argument open: a file name can no longer be mistaken for an open mode or a command
    open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
  }

  # warn "offset is $offset\n";
  my $temp = $filename;
  $temp .= ".temp.$offset";
  $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information

  my $out;
  if ($gzip){
    $temp .= '.gz';
    # NOTE(review): the output path is still handed to a shell here (needed for the redirection)
    open ($out,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
  }
  else{
    open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
  }

  my $line_count = 0; # counts FastQ records (4 lines each), not single lines

  while (1){
    my $l1 = <$in>;
    my $l2 = <$in>;
    my $l3 = <$in>;
    my $l4 = <$in>;

    # stop at the end of the file (an incomplete record at the very end is dropped). Testing for
    # definedness so that a last line consisting of a single '0' is not mistaken for the end
    last unless (defined $l4);
    ++$line_count;

    if ( ($line_count - $offset)%$multicore == 0){
      # this record belongs to the share of this instance
      print {$out} "$l1$l2$l3$l4";
    }
  }

  close $in or warn $!;
  close $out or warn "Failed to close file handle TEMPFQ: $!\n";

  warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

  return ($temp); # returning the subset filename

}
167
### Writes the share of a FastA file that one parallel instance has to process to a temporary file.
###
### Arguments: $filename   - FastA input file (2 lines per sequence); it is read through zcat if
###                          the name ends in 'gz'
###            $process_id - only used for the progress message
###            $offset     - number of the share to extract: a 2-line record is kept whenever
###                          (record count - $offset) % $multicore == 0
###
### The subset is written to ${temp_dir}<name>.temp.<offset> (gzip compressed and carrying an
### additional '.gz' suffix if $gzip is set), where <name> is $filename without path information.
### Returns the name of the subset file WITHOUT ${temp_dir}; callers have to prepend it themselves.
###
### Dies if the input or output file cannot be opened.
sub subset_input_file_FastA{

  my ($filename,$process_id,$offset) = @_;

  my $in;
  if ($filename =~ /gz$/){
    # list form of a piped open: the filename reaches zcat as a single argument and is never
    # interpreted by a shell (spaces or shell metacharacters in file names are harmless)
    open ($in,'-|','zcat',$filename) or die "Couldn't read from file '$filename': $!\n";
  }
  else{
    # 3-argument open: a file name can no longer be mistaken for an open mode or a command
    open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
  }

  # warn "offset is $offset\n";
  my $temp = $filename;
  $temp .= ".temp.$offset";
  # Removing file path information, exactly as it is done for FastQ files. This used to be missing
  # here, so for input files given with a path the subset was written to ${temp_dir}<path>/<file>
  # whereas the clean-up at the end of the run deletes ${temp_dir}<file>
  $temp =~ s/^.*\///;

  my $out;
  if ($gzip){
    $temp .= '.gz';
    # NOTE(review): the output path is still handed to a shell here (needed for the redirection)
    open ($out,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
  }
  else{
    open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
  }

  warn "Writing temporary infile to $temp\n";

  my $line_count = 0; # counts FastA records (2 lines each), not single lines

  while (1){
    my $l1 = <$in>;
    my $l2 = <$in>;

    # stop at the end of the file (an incomplete record at the very end is dropped). Testing for
    # definedness so that a last line consisting of a single '0' is not mistaken for the end
    last unless (defined $l2);
    ++$line_count;

    if ( ($line_count - $offset)%$multicore == 0){
      # this record belongs to the share of this instance
      print {$out} "$l1$l2";
    }
  }

  close $in or warn $!;
  close $out or warn "Failed to close file handle TEMPFQ: $!\n";

  warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

  return ($temp); # returning the subset filename

}
222
223 #####
224 #####
225
### MAIN LOOP: one round per entry in @filenames. An entry is either a single file (single-end) or
### two comma-separated files 'mate1,mate2' (paired-end). Each round
###   1) forks the child processes (multicore mode only) and subsets the input file(s) accordingly,
###   2) bisulfite converts the reads, starts the alignments and the methylation call,
###   3) parent process only: waits for the children, merges their temporary BAM/unmapped/ambiguous/
###      report files into the final output and deletes the temporary files,
###   4) child processes only: exits.
foreach my $filename (@filenames){

  my $original_filename = $filename;
  my $original_filename_1;
  my $original_filename_2;

  chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  @pids = ();
  $seqID_contains_tabs = 0;

  ### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by
  ### splitting the input file(s) into several smaller subfiles and merging the results back at
  ### the end.

  # get general settings (also for single-threaded use)
  # $pid is 0 for child processes and true for the parent (and in single-core mode)
  my ($pid,$offset) = multi_process_handling ();

  # extracts the share of an input file this process has to work on (only used in multicore mode)
  my $subset_input_file = sub {
    my ($file) = @_;
    if ($sequence_file_format eq 'FASTA'){
      return subset_input_file_FastA($file,$pid,$offset);
    }
    return subset_input_file_FastQ($file,$pid,$offset); # FastQ format, default
  };

  # stores the paired-end input files: alignment instances 0 and 3 always share one pair of input
  # files, instances 1 and 2 the other. Both arguments are references to [inputfile_1,inputfile_2]
  my $set_paired_inputfiles = sub {
    my ($files_0_and_3,$files_1_and_2) = @_;
    foreach my $index (0,3){
      $fhs[$index]->{inputfile_1} = $files_0_and_3->[0];
      $fhs[$index]->{inputfile_2} = $files_0_and_3->[1];
    }
    foreach my $index (1,2){
      $fhs[$index]->{inputfile_1} = $files_1_and_2->[0];
      $fhs[$index]->{inputfile_2} = $files_1_and_2->[1];
    }
  };

  my ($single_end,$paired_end);

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ ','){

    $single_end = 0;
    $paired_end = 1;

    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';
    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    $original_filename_1 = $filename_1;
    $original_filename_2 = $filename_2;

    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### subsetting the input file(s)
    unless ($multicore == 1){ # not needed in single-core mode
      my $temp_filename_1 = $subset_input_file->($filename_1);
      warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n";
      $filename_1 = "${temp_dir}$temp_filename_1";

      my $temp_filename_2 = $subset_input_file->($filename_2);
      warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n";
      $filename_2 = "${temp_dir}$temp_filename_2";
    }

    ### additional variables only for paired-end alignments
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);

        $set_paired_inputfiles->([$C_to_T_infile_1,$G_to_A_infile_2],[undef,undef]);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);

        $set_paired_inputfiles->([$C_to_T_infile_1,$G_to_A_infile_2],[$G_to_A_infile_1,$C_to_T_infile_2]);
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      # Only Bowtie 1 alignments of gzip compressed temp files use the combined conversion, which
      # takes both reads at the same time
      my $bowtie1_gzip = (!$bowtie2 and $gzip);

      if ($directional){
        if ($bowtie1_gzip){
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time

          # this file contains both read 1 and read 2 in tab delimited format, inputfile_2 is no longer needed
          $set_paired_inputfiles->([$C_to_T_infile_1,undef],[undef,undef]);
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

          $set_paired_inputfiles->([$C_to_T_infile_1,$G_to_A_infile_2],[undef,undef]);
        }
      }
      elsif($pbat){ # PBAT-Seq. This works for both Bowtie and Bowtie 2
        ### At the moment we are only performing alignments only with uncompressed FastQ files
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        $set_paired_inputfiles->([undef,undef],[$G_to_A_infile_1,$C_to_T_infile_2]);
      }
      else{
        if ($bowtie1_gzip){
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time

          # inputfile_2 is not needed for compressed temp files
          $set_paired_inputfiles->([$C_to_T_infile_1,undef],[$G_to_A_infile_1,undef]);
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

          $set_paired_inputfiles->([$C_to_T_infile_1,$G_to_A_infile_2],[$G_to_A_infile_1,$C_to_T_infile_2]);
        }
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }
    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

    $single_end = 1;
    $paired_end = 0;

    ### subsetting the input file(s)
    unless ($multicore == 1){ # not needed in single-core mode
      my $temp_filename = $subset_input_file->($filename);
      warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n";
      $filename = "${temp_dir}$temp_filename";
    }

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input file is in FastA format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
      }
    }

    ## FastQ format
    else{
      warn "Input file is in FastQ format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      elsif($pbat){
        ($G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating up to 4 different bowtie filehandles and storing the first entry
      if ($pbat){
        if ($bowtie2){ # as of version 0.10.2 we also support PBAT alignments for Bowtie 2
          single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (undef,$G_to_A_infile);
        }
        else{
          single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
        }
      }
      elsif ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
      }
    }

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid);

  }

  ### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS

  if ($pid){ # only performing this for the parent process

    if ($multicore > 1){

      warn "Now waiting for all child processes to complete\n";

      ### we need to ensure that we wait for all child processes to be finished before continuing
      foreach my $id (@pids){
        waitpid ($id,0);
        unless ($? == 0){
          warn "\nChild process terminated with exit signal: '$?'\n\n";
        }
      }

      # regenerating names for temporary files
      my @temp_input;
      my @temp_output;
      my @temp_reports;
      my @temp_unmapped_1; # will store single end reads or R1 of paired-end
      my @temp_unmapped_2;
      my @temp_ambiguous_1; # will store single end reads or R1 of paired-end
      my @temp_ambiguous_2;

      # The names of all temporary files are made up of the same building blocks:
      my $gz_suffix  = $gzip    ? '.gz'  : ''; # compressed subset files carry an additional .gz
      my $bt2_suffix = $bowtie2 ? '_bt2' : ''; # Bowtie 2 output files are tagged with _bt2
      # if files had a prefix we need to specify it
      my $add_prefix = (defined $prefix) ? "${prefix}." : '';

      foreach my $index (1..$offset){

        if ($single_end){
          my $temp_name = "${original_filename}.temp.${index}${gz_suffix}";
          my $out_stem  = "${output_dir}${add_prefix}${temp_name}";

          # Temp Input Files
          push @temp_input, $temp_name;

          # Temp Output Files
          push @temp_output,  "${out_stem}_bismark${bt2_suffix}.bam";
          push @temp_reports, "${out_stem}_bismark${bt2_suffix}_SE_report.txt";

          if ($unmapped){
            push @temp_unmapped_1, "${out_stem}_unmapped_reads.fq";
          }
          if ($ambiguous){
            push @temp_ambiguous_1, "${out_stem}_ambiguous_reads.fq";
          }
        }
        else{ # paired-end
          my $temp_name_1 = "${original_filename_1}.temp.${index}${gz_suffix}";
          my $temp_name_2 = "${original_filename_2}.temp.${index}${gz_suffix}";
          my $out_stem_1  = "${output_dir}${add_prefix}${temp_name_1}";
          my $out_stem_2  = "${output_dir}${add_prefix}${temp_name_2}";

          # Temp Input Files
          push @temp_input, $temp_name_1;
          push @temp_input, $temp_name_2;

          # Temp Output Files (BAM and report files are named after read 1)
          push @temp_output,  "${out_stem_1}_bismark${bt2_suffix}_pe.bam";
          push @temp_reports, "${out_stem_1}_bismark${bt2_suffix}_PE_report.txt";

          if ($unmapped){
            push @temp_unmapped_1, "${out_stem_1}_unmapped_reads_1.fq";
            push @temp_unmapped_2, "${out_stem_2}_unmapped_reads_2.fq";
          }
          if ($ambiguous){
            push @temp_ambiguous_1, "${out_stem_1}_ambiguous_reads_1.fq";
            push @temp_ambiguous_2, "${out_stem_2}_ambiguous_reads_2.fq";
          }
        }
      }

      warn "\n\nRight, cleaning up now...\n\n";

      # deleting temp files;
      warn "Deleting temporary sequence files...\n";
      foreach my $temp (@temp_input){
        $temp =~ s/.*\///; # deleting path information
        print "${temp_dir}${temp}\t";
        unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n";
      }
      print "\n\n";

      # merging temp BAM files
      if ($single_end){
        merge_individual_BAM_files(\@temp_output,$original_filename,$single_end);
      }
      else{
        merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end);
      }

      # deleting temp BAM files
      warn "Deleting temporary BAM files...\n";
      foreach my $temp (@temp_output){
        $temp =~ s/.*\///; # deleting path information
        print "${output_dir}${temp}\t";
        unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n";
      }
      print "\n\n";

      if ($unmapped){
        if ($single_end){
          merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end);
        }
        else{
          merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1');
          merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2');
        }

        # deleting temp unmapped files (the list for read 2 is only populated for paired-end files)
        # NOTE(review): the names were generated including $output_dir; this only deletes the right
        # files if merge_individual_unmapped_files removes the path information from the list
        # entries - confirm
        warn "Deleting temporary unmapped files...\n";
        foreach my $temp (@temp_unmapped_1,@temp_unmapped_2){
          print "$temp\t";
          unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n";
        }
        print "\n\n";

      }

      if ($ambiguous){
        if ($single_end){
          merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end);
        }
        else{
          merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1');
          merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2');
        }

        # deleting temp ambiguous files (the list for read 2 is only populated for paired-end files)
        warn "Deleting temporary ambiguous files...\n";
        foreach my $temp (@temp_ambiguous_1,@temp_ambiguous_2){
          print "$temp\t";
          unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n";
        }
        print "\n\n";
      }

      # resetting the counters once more so we can add all data from all temporary reports
      reset_counters_and_fhs($original_filename);

      ### Merging the Bismark mapping report files
      if ($single_end){
        merge_individual_splitting_reports(\@temp_reports,$original_filename,$single_end);
        print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis');
      }
      else{
        merge_individual_splitting_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2);
        print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis');
      }

      # deleting temp report files
      warn "Deleting temporary report files...\n";
      foreach my $temp (@temp_reports){
        print "$temp\t";
        unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n";
      }
      print "\n\n";

    }

  }

  if ($pid){ # only for the Parent
    warn "\n====================\nBismark run complete\n====================\n\n";
  }
  else{
    # A child process has finished its share of the current file at this point and must terminate
    # here. It would otherwise move on to the next entry in @filenames, fork children of its own
    # and process files the parent process is taking care of already.
    exit 0;
  }

}
789
### Starts the final mapping report of a multicore run from the temporary reports of the
### individual processes.
###
### Arguments: $temp_reports        - reference to the list of temporary report files. The path
###                                   information of every entry is removed IN PLACE, i.e. the
###                                   list of the caller only holds plain file names afterwards
###            $original_filename_1 - input file (read 1 for paired-end); names the report unless
###                                   $basename is set
###            $single_end          - true for single-end, false for paired-end reports
###            $original_filename_2 - read 2 input file (paired-end only, used in the header line)
###
### The report is written to ${output_dir}<name>_bismark[_bt2]_(SE|PE)_report.txt. It receives a
### fresh 'Bismark report for:' line plus the lines preceding 'Final Alignment' of the first
### temporary report. Every temporary report is also passed to read_alignment_report().
### The filehandle REPORT is left open on purpose when this subroutine returns: the callers print
### the merged alignment summary right afterwards (presumably to REPORT - confirm).
###
### Dies if the report file cannot be created or a temporary report cannot be read.
sub merge_individual_splitting_reports{

  my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_;
  my $report_file = $original_filename_1;
  $report_file =~ s/.*\///; # removing path information
  if ($prefix){
    $report_file = "${prefix}.${report_file}";
  }

  if ($basename){ # Output file basename is set using the -B argument
    $report_file = ${basename};
  }

  # report file suffix depends on the aligner and the type of library
  my $aligner_tag = $bowtie2    ? '_bt2' : '';
  my $library_tag = $single_end ? 'SE'   : 'PE';
  $report_file .= "_bismark${aligner_tag}_${library_tag}_report.txt";

  warn "Writing report to ${output_dir}${report_file}\n";
  open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n";

  # modifies the list of the caller, which relies on the names being free of path information
  foreach my $temp(@$temp_reports){
    $temp =~ s/.*\///; # removing path information
  }

  warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n";

  if ($single_end){
    print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n";
  }
  else{ # paired-end
    print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n";
  }


  my $first = 0; # set once the header of a temporary report has been copied up to 'Final Alignment'

  foreach my $temp(@$temp_reports){

    warn "Merging from file >> $temp <<\n";
    # 3-argument open, so that an odd file name cannot be mistaken for an open mode or a command
    open (my $in,'<',"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}': $!\n";

    ### this is printing the first couple of lines
    while (my $line = <$in>){
      last if ($first); # the header has been copied already, nothing else is taken from here
      chomp $line;

      next if ($line =~ /^Bismark report/); # replaced by the header line printed above

      if ($line =~ /^Final Alignment/){
        ++$first;
        last;
      }
      print REPORT "$line\n";
    }
    close $in or warn "Failed to close filehandle\n";

    ### Simon says: You are going to regret this in the future. Just for the record. He might be right...
    read_alignment_report($temp,$single_end);

  }
  warn "\n";

}
870
### Reads one temporary (per-chunk) mapping report and adds the numbers found in it to the
### running totals kept in %counting.
###
### Arguments:
###   $report     - name of the temporary report; it is opened as "${output_dir}${report}"
###   $single_end - true:  strand-origin numbers go to the single-end counters (CT_CT_count, ...)
###                 false: they go to the paired-end counters (CT_GA_CT_count, ...)
###
### Every statistic is taken from the second tab-separated field of its report line. A statistic
### that does not occur in the report contributes 0 to its total (instead of adding undef, which
### triggers an 'uninitialized value' warning).
### Dies if the report cannot be opened.
sub read_alignment_report{
  my ($report,$single_end) = @_;

  ### Start of a report line => slot its value is stored in. The paired-end and the single-end
  ### wording of the same statistic share one slot.
  my @report_lines = (
    ### General Alignment stats
    [ qr{^Sequence pairs analysed in total:},                                        'total_seqs' ],  # Paired-end
    [ qr{^Sequences analysed in total:},                                             'total_seqs' ],  # Single-end
    [ qr{^Number of paired-end alignments with a unique best hit:},                  'unique'     ],  # Paired-end
    [ qr{^Number of alignments with a unique best hit from},                         'unique'     ],  # Single-end
    [ qr{^Sequence pairs with no alignments under any condition:},                   'no_aln'     ],  # Paired-end
    [ qr{^Sequences with no alignments under any condition:},                        'no_aln'     ],  # Single-end
    [ qr{^Sequence pairs did not map uniquely:},                                     'multiple'   ],  # Paired-end
    [ qr{^Sequences did not map uniquely:},                                          'multiple'   ],  # Single-end
    [ qr{^Sequence pairs which were discarded because genomic sequence could not be extracted:}, 'no_genomic' ],  # Paired-end
    [ qr{^Sequences which were discarded because genomic sequence could not be extracted:},      'no_genomic' ],  # Single-end

    ### Context Methylation
    [ qr{^Total methylated C's in CpG context:},                                     'meth_CpG'     ],
    [ qr{^Total methylated C's in CHG context:},                                     'meth_CHG'     ],
    [ qr{^Total methylated C's in CHH context:},                                     'meth_CHH'     ],
    [ qr{^Total methylated C's in Unknown context:},                                 'meth_unknown' ],
    [ qr{^Total (?:unmethylated C's|C to T conversions) in CpG context:},            'unmeth_CpG'     ],
    [ qr{^Total (?:unmethylated C's|C to T conversions) in CHG context:},            'unmeth_CHG'     ],
    [ qr{^Total (?:unmethylated C's|C to T conversions) in CHH context:},            'unmeth_CHH'     ],
    [ qr{^Total (?:unmethylated C's|C to T conversions) in Unknown context:},        'unmeth_unknown' ],

    ### Strand Origin
    [ qr{^CT/GA/CT:}, 'number_OT'   ],  # Paired-end
    [ qr{^CT/CT:},    'number_OT'   ],  # Single-end
    [ qr{^GA/CT/CT:}, 'number_CTOT' ],  # Paired-end
    [ qr{^GA/CT:},    'number_CTOT' ],  # Single-end
    [ qr{^GA/CT/GA:}, 'number_CTOB' ],  # Paired-end
    [ qr{^GA/GA:},    'number_CTOB' ],  # Single-end
    [ qr{^CT/GA/GA:}, 'number_OB'   ],  # Paired-end
    [ qr{^CT/GA:},    'number_OB'   ],  # Single-end
  );

  my %seen; # slot => value read from the report

  open (my $aln,'<',"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report': $!\n";

  while (my $line = <$aln>){
    chomp $line;

    foreach my $entry (@report_lines){
      my ($pattern,$slot) = @$entry;
      next unless ($line =~ $pattern);

      (undef,$seen{$slot}) = split (/\t/,$line);
      last; # the patterns are anchored and mutually exclusive, the first hit is the only hit
    }
  }

  close $aln or warn "Failed to close filehandle\n";

  ### value of a slot, or 0 if the report did not contain the corresponding line
  my $number_of = sub {
    my ($slot) = @_;
    return defined $seen{$slot} ? $seen{$slot} : 0;
  };

  $counting{sequences_count}                               += $number_of->('total_seqs');
  $counting{unique_best_alignment_count}                   += $number_of->('unique');
  $counting{no_single_alignment_found}                     += $number_of->('no_aln');
  $counting{unsuitable_sequence_count}                     += $number_of->('multiple');
  $counting{genomic_sequence_could_not_be_extracted_count} += $number_of->('no_genomic');

  $counting{total_meCHH_count} += $number_of->('meth_CHH');
  $counting{total_meCHG_count} += $number_of->('meth_CHG');
  $counting{total_meCpG_count} += $number_of->('meth_CpG');

  $counting{total_unmethylated_CHH_count} += $number_of->('unmeth_CHH');
  $counting{total_unmethylated_CHG_count} += $number_of->('unmeth_CHG');
  $counting{total_unmethylated_CpG_count} += $number_of->('unmeth_CpG');

  ### the Unknown context is only accumulated for Bowtie 2 runs
  if ($bowtie2){
    $counting{total_meC_unknown_count}            += $number_of->('meth_unknown');
    $counting{total_unmethylated_C_unknown_count} += $number_of->('unmeth_unknown');
  }

  if ($single_end){
    $counting{CT_CT_count} += $number_of->('number_OT');
    $counting{CT_GA_count} += $number_of->('number_OB');
    $counting{GA_CT_count} += $number_of->('number_CTOT');
    $counting{GA_GA_count} += $number_of->('number_CTOB');
  }
  else{
    # paired-end
    $counting{GA_CT_CT_count} += $number_of->('number_CTOT');
    $counting{CT_GA_CT_count} += $number_of->('number_OT');
    $counting{GA_CT_GA_count} += $number_of->('number_CTOB');
    $counting{CT_GA_GA_count} += $number_of->('number_OB');
  }
}
1060
### Concatenates the temporary ambiguous-read files of a run into one gzip-compressed file
### inside $output_dir.
###
### Arguments:
###   $temp_ambiguous     - array reference holding the names of the temporary files. NOTE: any
###                         leading path is stripped from the elements IN PLACE, i.e. the caller's
###                         array is modified; the files are read from $output_dir
###   $original_filename  - name of the original input file; the merged file name is derived from
###                         it unless $basename is set
###   $single_end         - true for single-end data, false for paired-end data
###   $paired_information - paired-end only: text inserted after '_ambiguous_reads' in the file
###                         name (presumably a read tag such as '_1'/'_2' - confirm against caller)
###
### Files whose name ends in 'gz' are read through zcat, all others are read directly.
### Dies if the output pipe or any of the input files cannot be opened.
sub merge_individual_ambiguous_files{

  my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_;

  my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
  my $suffix    = $single_end ? "_ambiguous_reads.${extension}.gz"
                              : "_ambiguous_reads${paired_information}.${extension}.gz";

  my $ambiguous_file;
  if ($basename){ # Output file basename is set using the -B argument; it replaces file name and prefix
    $ambiguous_file = "${basename}${suffix}";
  }
  else{
    $ambiguous_file = $original_filename;
    $ambiguous_file =~ s/.*\///; # removing path information
    if ($prefix){
      $ambiguous_file = "${prefix}.${ambiguous_file}";
    }
    $ambiguous_file .= $suffix;
  }

  foreach my $temp(@$temp_ambiguous){
    $temp =~ s/.*\///; # removing path information (this changes the caller's array as well)
  }

  ### The output path becomes part of a shell command line, so it is single-quoted to keep
  ### blanks and shell metacharacters in directory or file names from being interpreted
  (my $quoted_target = "$output_dir$ambiguous_file") =~ s/'/'\\''/g;

  open (AMBIGUOUS,'|-',"gzip -c - > '$quoted_target'") or die "Failed to write to $ambiguous_file: $!\n";
  warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n";

  foreach my $temp(@$temp_ambiguous){
    warn "Merging from file >> $temp <<\n";

    my $in;
    if ($temp =~ /gz$/){
      ### list form: zcat is started directly, the file name never passes through a shell
      open ($in,'-|','zcat',"${output_dir}$temp") or die "Failed to read from ambiguous temp file '${output_dir}$temp': $!\n";
    }
    else{
      open ($in,'<',"${output_dir}$temp") or die "Failed to read from ambiguous temp file '${output_dir}$temp': $!\n";
    }

    while (my $line = <$in>){
      print AMBIGUOUS $line;
    }
    close $in or warn "Failed to close filehandle\n";
  }
  warn "\n";

  close AMBIGUOUS or warn "Failed to close output filehandle AMBIGUOUS\n\n";
}
1135
1136
### Concatenates the temporary unmapped-read files of a run into one gzip-compressed file
### inside $output_dir.
###
### Arguments:
###   $temp_unmapped      - array reference holding the names of the temporary files. NOTE: any
###                         leading path is stripped from the elements IN PLACE, i.e. the caller's
###                         array is modified; the files are read from $output_dir
###   $original_filename  - name of the original input file; the merged file name is derived from
###                         it unless $basename is set
###   $single_end         - true for single-end data, false for paired-end data
###   $paired_information - paired-end only: text inserted after '_unmapped_reads' in the file
###                         name (presumably a read tag such as '_1'/'_2' - confirm against caller)
###
### Files whose name ends in 'gz' are read through zcat, all others are read directly.
### Dies if the output pipe or any of the input files cannot be opened.
sub merge_individual_unmapped_files{

  my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_;

  my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
  my $suffix    = $single_end ? "_unmapped_reads.${extension}.gz"
                              : "_unmapped_reads${paired_information}.${extension}.gz";

  my $unmapped_file;
  if ($basename){ # Output file basename is set using the -B argument; it replaces file name and prefix
    $unmapped_file = "${basename}${suffix}";
  }
  else{
    $unmapped_file = $original_filename;
    $unmapped_file =~ s/.*\///; # removing path information
    if ($prefix){
      $unmapped_file = "${prefix}.${unmapped_file}";
    }
    $unmapped_file .= $suffix;
  }

  foreach my $temp(@$temp_unmapped){
    $temp =~ s/.*\///; # removing path information (this changes the caller's array as well)
  }

  ### The output path becomes part of a shell command line, so it is single-quoted to keep
  ### blanks and shell metacharacters in directory or file names from being interpreted
  (my $quoted_target = "${output_dir}${unmapped_file}") =~ s/'/'\\''/g;

  open (UNMAPPED,'|-',"gzip -c - > '$quoted_target'") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n";
  warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n";

  foreach my $temp(@$temp_unmapped){
    warn "Merging from file >> $temp <<\n";

    my $in;
    if ($temp =~ /gz$/){
      ### list form: zcat is started directly, the file name never passes through a shell
      open ($in,'-|','zcat',"${output_dir}${temp}") or die "Failed to read from unmapped temp file '${output_dir}$temp': $!\n";
    }
    else{
      open ($in,'<',"${output_dir}${temp}") or die "Failed to read from unmapped temp file '${output_dir}${temp}': $!\n";
    }

    while (my $line = <$in>){
      print UNMAPPED $line;
    }
    close $in or warn "Failed to close filehandle\n";
  }
  warn "\n";

  close UNMAPPED or warn "Failed to close output filehandle UNMAPPED\n\n";
}
1211
1212
### Merges the temporary BAM files of a run into a single BAM file inside $output_dir. The
### alignments are streamed as SAM text through '$samtools_path view'; the header is taken from
### the first temporary file only.
###
### Arguments:
###   $tempbam           - array reference holding the names of the temporary BAM files. NOTE: any
###                        leading path is stripped from the elements IN PLACE, i.e. the caller's
###                        array is modified; the files are read from $output_dir
###   $original_filename - name of the original input file; the merged file name is derived from
###                        it unless $basename is set
###   $single_end        - true for single-end data, false for paired-end data
###
### NOTE(review): $samtools_path is treated as the path (or name) of a single executable without
### additional arguments - confirm against the place where it is set.
### Dies if the output pipe or any of the input pipes cannot be opened.
sub merge_individual_BAM_files{

  my ($tempbam,$original_filename,$single_end) = @_;

  foreach my $temp_bam(@$tempbam){
    $temp_bam =~ s/.*\///; # deleting path information (this changes the caller's array as well)
  }

  my $merged_name;
  if ($basename){ # Output file basename is set using the -B argument; it replaces file name and prefix
    $merged_name = $single_end ? "${basename}.bam" : "${basename}_pe.bam";
  }
  else{
    $merged_name = $original_filename;
    $merged_name =~ s/.*\///; # deleting path information
    if ($prefix){
      $merged_name = "$prefix.$merged_name";
    }
    $merged_name .= $bowtie2    ? '_bismark_bt2' : '_bismark';
    $merged_name .= $single_end ? '.bam'         : '_pe.bam';
  }

  ### single-quotes a string for use as one word on a shell command line
  my $shell_quote = sub {
    my ($word) = @_;
    $word =~ s/'/'\\''/g;
    return "'$word'";
  };

  warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";

  ### the redirection needs a shell, so program and output path are quoted to keep blanks and
  ### shell metacharacters in them from being interpreted
  my $write_command = $shell_quote->($samtools_path)." view -bSh 2>/dev/null - > ".$shell_quote->("${output_dir}${merged_name}");
  open (OUT,'|-',$write_command) or die "Failed to write to $merged_name: $!\n";

  my $print_header = 1; # only for the first file we print the header as well

  foreach my $temp_bam(@$tempbam){
    warn "Merging from file >> $temp_bam <<\n";

    my @view_arguments = $print_header ? ('view','-h') : ('view');

    ### list form: samtools is started directly, the file name never passes through a shell
    open (my $in,'-|',$samtools_path,@view_arguments,"${output_dir}${temp_bam}") or die "Failed to read from BAM file ${output_dir}${temp_bam}: $!\n";

    while (my $line = <$in>){
      print OUT $line;
    }
    close $in or warn "Failed to close filehandle\n";
    $print_header = 0;
  }
  warn "\n";

  close OUT or warn "Failed to close output filehandle\n\n";
}
1281
### Prepares all output of a single-end run and then starts the methylation call:
###   - opens the alignment output (OUT) as BAM via samtools ($bam == 1), gzipped ($bam == 2)
###     or plain text (default)
###   - opens the mapping report (REPORT) and writes its header lines
###   - opens the optional files for unmapped (UNMAPPED) and ambiguous (AMBIG) reads
###   - reads the genome into memory unless %chromosomes is already populated
###   - writes the SAM header unless --vanilla or --sam-no-hd was specified
###   - hands over to the FastA or FastQ specific processing sub
###
### Arguments:
###   $sequence_file  - input file name, with or without leading path
###   $C_to_T_infile, $G_to_A_infile, $pid - passed on unchanged to the processing sub
###
### All file names are derived from the path-less input file name (with "$prefix." in front if
### a prefix is set); if $basename is set it replaces this stem. The bareword filehandles are
### left open when the sub returns (presumably they are written to by the subs called at the
### end - confirm there). $bam is set to 0 if it was undefined.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### input file name without path information
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### common stem of all default output file names
  my $stem = $prefix ? "$prefix.$filename" : $filename;

  ### printing all alignments to a results file
  my $outfile_ending = $bowtie2 ? '_bismark_bt2.sam'  # SAM format is the default for Bowtie 2
                     : $vanilla ? '_bismark.txt'      # vanilla custom Bismark single-end output (like Bismark versions 0.5.X)
                     :            '_bismark.sam';     # SAM is the default output
  my $outfile = $stem;
  $outfile =~ s/$/$outfile_ending/;
  $outfile = "${basename}.sam" if ($basename); # Output file basename is set using the -B argument

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    $outfile =~ s/sam$/bam/;
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif ($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  print OUT "Bismark version: $bismark_version\n" if ($vanilla);

  ### printing alignment and methylation call summary to a report file
  my $report_ending = $bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt';
  my $reportfile = $stem;
  $reportfile =~ s/$/$report_ending/;
  $reportfile = "${basename}_SE_report.txt" if ($basename); # Output file basename is set using the -B argument

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  ### file extension of the optional read output files
  my $read_extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';

  if ($unmapped){
    my $unmapped_file = $stem;
    $unmapped_file =~ s/$/_unmapped_reads.$read_extension/;
    $unmapped_file = "${basename}_unmapped_reads.$read_extension" if ($basename); # -B argument

    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    warn "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }

  if ($ambiguous){
    my $ambiguous_file = $stem;
    $ambiguous_file =~ s/$/_ambiguous_reads.$read_extension/;
    $ambiguous_file = "${basename}_ambiguous_reads.$read_extension" if ($basename); # -B argument

    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  ### recording the library type in the report
  my $library_note;
  if ($directional){
    $library_note = "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n";
  }
  elsif ($pbat){
    $library_note = "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n";
  }
  else{
    $library_note = "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n";
  }
  print REPORT $library_note;

  my $aligner = $bowtie2 ? 'Bowtie 2' : 'Bowtie';
  print REPORT "Bismark was run with $aligner against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  if (!%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  if (!$vanilla and !$sam_no_hd){
    generate_SAM_header();
  }

  if ($sequence_file_format eq 'FASTA'){ ### Input file is in FastA format
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
  }
  else{ ### Input file is in FastQ format
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
  }
}
1443
### Prepares all output of a paired-end run and then starts the methylation call:
###   - opens the alignment output (OUT) as BAM via samtools ($bam == 1), gzipped ($bam == 2)
###     or plain text (default)
###   - opens the mapping report (REPORT) and writes its header lines
###   - opens the optional files for unmapped (UNMAPPED_1/_2) and ambiguous (AMBIG_1/_2) reads
###   - reads the genome into memory unless %chromosomes is already populated
###   - writes the SAM header unless --vanilla or --sam-no-hd was specified
###   - hands over to the FastA or FastQ specific processing sub
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - input file names, with or without leading path
###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2, $pid
###                                      - passed on unchanged to the processing sub
###
### Alignment output and report are named after the first input file, the read output files
### after their respective input file (each with "$prefix." in front if a prefix is set); if
### $basename is set it replaces these stems. The bareword filehandles are left open when the
### sub returns (presumably they are written to by the subs called at the end - confirm there).
### $bam is set to 0 if it was undefined.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;

  ### input file names without path information
  my $filename_1 = $sequence_file_1;
  if ($sequence_file_1 =~ /\//){
    ($filename_1) = $sequence_file_1 =~ m/.*\/(.*)$/;
  }

  my $filename_2 = $sequence_file_2;
  if ($sequence_file_2 =~ /\//){
    ($filename_2) = $sequence_file_2 =~ m/.*\/(.*)$/;
  }

  ### stems of the default output file names
  my $stem_1 = $prefix ? "$prefix.$filename_1" : $filename_1;
  my $stem_2 = $prefix ? "$prefix.$filename_2" : $filename_2;

  ### printing all alignments to a results file
  my $outfile_ending = $bowtie2 ? '_bismark_bt2_pe.sam'  # SAM format is the default Bowtie 2 output
                     : $vanilla ? '_bismark_pe.txt'      # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
                     :            '_bismark_pe.sam';     # SAM format is the default Bowtie 1 output
  my $outfile = $stem_1;
  $outfile =~ s/$/$outfile_ending/;
  $outfile = "${basename}_pe.sam" if ($basename); # Output file basename is set using the -B argument

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    $outfile =~ s/sam$/bam/;
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif ($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  print OUT "Bismark version: $bismark_version\n" if ($vanilla);

  ### printing alignment and methylation call summary to a report file
  my $report_ending = $bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt';
  my $reportfile = $stem_1;
  $reportfile =~ s/$/$report_ending/;
  $reportfile = "${basename}_PE_report.txt" if ($basename); # Output file basename is set using the -B argument

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";

  my $aligner = $bowtie2 ? 'Bowtie 2' : 'Bowtie';
  print REPORT "Bismark was run with $aligner against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n";

  ### file extension of the optional read output files
  my $read_extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $stem_1;
    my $unmapped_2 = $stem_2;
    $unmapped_1 =~ s/$/_unmapped_reads_1.$read_extension/;
    $unmapped_2 =~ s/$/_unmapped_reads_2.$read_extension/;

    if ($basename){ # Output file basename is set using the -B argument
      $unmapped_1 = "${basename}_unmapped_reads_1.$read_extension";
      $unmapped_2 = "${basename}_unmapped_reads_2.$read_extension";
    }

    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = $stem_1;
    my $amb_2 = $stem_2;
    $amb_1 =~ s/$/_ambiguous_reads_1.$read_extension/;
    $amb_2 =~ s/$/_ambiguous_reads_2.$read_extension/;

    if ($basename){ # Output file basename is set using the -B argument
      $amb_1 = "${basename}_ambiguous_reads_1.$read_extension";
      $amb_2 = "${basename}_ambiguous_reads_2.$read_extension";
    }

    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  ### recording the library type in the report
  my $library_note;
  if ($directional){
    $library_note = "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n";
  }
  elsif ($pbat){
    $library_note = "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n";
  }
  else{
    $library_note = "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n";
  }
  print REPORT $library_note;

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  if (!%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    warn "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  if (!$vanilla and !$sam_no_hd){
    generate_SAM_header();
  }

  if ($sequence_file_format eq 'FASTA'){ ### Input files are in FastA format
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }
  else{ ### Input files are in FastQ format
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }
}
1639
1640 sub print_final_analysis_report_single_end{
1641 my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_;
1642
1643 if ($merge_multi){
1644 warn "Printing a final merged alignment report for all individual sub-reports\n\n";
1645 }
1646 else{
1647 ### All sequences from the original sequence file have been analysed now
1648 ### deleting temporary C->T or G->A infiles
1649
1650 if ($directional){
1651 my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
1652 if ($deletion_successful == 1){
1653 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
1654 }
1655 else{
1656 warn "Could not delete temporary file $C_to_T_infile properly $!\n";
1657 }
1658 }
1659 elsif ($pbat){
1660 my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
1661 if ($deletion_successful == 1){
1662 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
1663 }
1664 else{
1665 warn "Could not delete temporary file $G_to_A_infile properly $!\n";
1666 }
1667 }
1668 else{
1669 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
1670 if ($deletion_successful == 2){
1671 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
1672 }
1673 else{
1674 warn "Could not delete temporary files properly $!\n";
1675 }
1676 }
1677 }
1678
1679 ### printing a final report for the alignment procedure
1680 print REPORT "Final Alignment report\n",'='x22,"\n";
1681 warn "Final Alignment report\n",'='x22,"\n";
1682 # foreach my $index (0..$#fhs){
1683 # print "$fhs[$index]->{name}\n";
1684 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
1685 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
1686 # }
1687
1688 ### printing a final report for the methylation call procedure
1689 warn "Sequences analysed in total:\t$counting{sequences_count}\n";
1690 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
1691 my $percent_alignable_sequences;
1692
1693 if ($counting{sequences_count} == 0){
1694 $percent_alignable_sequences = 0;
1695 }
1696 else{
1697 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
1698 }
1699
1700 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
1701 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
1702
1703 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
1704 ### only calculating the percentage if there were any overruled alignments
1705 if ($counting{low_complexity_alignments_overruled_count}){
1706 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
1707 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
1708 }
1709
1710 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
1711 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
1712 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
1713 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
1714 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
1715
1716 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
1717 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
1718 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
1719 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
1720 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
1721
1722 if ($directional){
1723 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
1724 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
1725 }
1726
1727 ### detailed information about Cs analysed
1728 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
1729 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
1730 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
1731 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1732 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1733 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
1734 if ($bowtie2){
1735 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
1736 }
1737 warn "\n";
1738
1739 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1740 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1741 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
1742 if ($bowtie2){
1743 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
1744 }
1745 warn "\n";
1746
1747 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
1748 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
1749
1750 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
1751 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
1752 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
1753 if ($bowtie2){
1754 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
1755 }
1756 print REPORT "\n";
1757
1758 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
1759 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
1760 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
1761 if ($bowtie2){
1762 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
1763 }
1764 print REPORT "\n";
1765
1766 my $percent_meCHG;
1767 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
1768 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
1769 }
1770
1771 my $percent_meCHH;
1772 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
1773 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
1774 }
1775
1776 my $percent_meCpG;
1777 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
1778 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
1779 }
1780
1781 my $percent_meC_unknown;
1782 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
1783 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
1784 }
1785
1786
1787 ### printing methylated CpG percentage if applicable
1788 if ($percent_meCpG){
1789 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
1790 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
1791 }
1792 else{
1793 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1794 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
1795 }
1796
1797 ### printing methylated C percentage (CHG context) if applicable
1798 if ($percent_meCHG){
1799 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
1800 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
1801 }
1802 else{
1803 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1804 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
1805 }
1806
1807 ### printing methylated C percentage (CHH context) if applicable
1808 if ($percent_meCHH){
1809 warn "C methylated in CHH context:\t${percent_meCHH}%\n";
1810 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
1811 }
1812 else{
1813 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
1814 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
1815 }
1816
1817 ### printing methylated C percentage (Unknown C context) if applicable
1818 if ($bowtie2){
1819 if ($percent_meC_unknown){
1820 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
1821 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
1822 }
1823 else{
1824 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
1825 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
1826 }
1827 }
1828 print REPORT "\n\n";
1829 warn "\n\n";
1830
1831 if ($seqID_contains_tabs){
1832 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
1833 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
1834 }
1835 }
1836
1837
### Prints the final alignment report and the final cytosine methylation report of a paired-end run.
### Summary text is sent to STDERR (warn) and STDOUT (print), and the same figures are written to the REPORT filehandle.
### Unless $merge_multi is set, the temporary C->T/G->A converted infiles located in $temp_dir are deleted first.
### Arguments: the names of the (up to four) temporary converted infiles, $pid (not used within this subroutine) and the optional flag $merge_multi.
### Uses the file-level variables %counting, $directional, $bowtie2 and $temp_dir, and prints to the REPORT filehandle which has to be open already.
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_;

  if ($merge_multi){
    warn "Printing a final merged alignment report for all individual sub-reports\n\n";
  }
  else{
    ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
    if ($directional){
      if ($G_to_A_infile_2){
        my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
        if ($deletion_successful == 2){
          warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
        }
        else{
          warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
        }
      }
      else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
        my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
        if ($deletion_successful == 1){
          warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
        }
        else{
          warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
        }
      }
    }
    else{
      if ($G_to_A_infile_2 and $C_to_T_infile_2){
        my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
        if ($deletion_successful == 4){
          warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
        }
        else{
          warn "Could not delete temporary files properly: $!\n";
        }
      }
      else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
        my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
        if ($deletion_successful == 2){
          warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
        }
        else{
          warn "Could not delete temporary files properly: $!\n";
        }
      }
    }
  }

  ### printing a final report for the alignment procedure
  warn "Final Alignment report\n",'='x22,"\n";
  print REPORT "Final Alignment report\n",'='x22,"\n";

  ### printing a final report for the methylation call procedure
  warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";

  ### the mapping efficiency stays at 0 if no sequence pairs were analysed at all (this avoids a division by zero)
  my $percent_alignable_sequence_pairs = 0;
  if ($counting{sequences_count} != 0){
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  ### N.B. the STDOUT and the REPORT version of this line end differently ("%\n\n" vs. "% \n"); both are kept exactly as they were
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  ### STDOUT and REPORT receive exactly the same alignment details, so the text is assembled only once
  my $alignment_details = join ('',
    "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
    join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),
    "\n\n",
  );
  if ($directional){
    $alignment_details .= "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }
  print $alignment_details;
  print REPORT $alignment_details;

  ### detailed information about Cs analysed
  warn "Final Cytosine Methylation Report\n",'='x33,"\n";
  print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";

  ### cytosine contexts in the order in which they are reported; each entry names the %counting keys holding its methylated/unmethylated calls
  my @contexts = (
    { label => 'CpG', description => 'CpG context', methylated => 'total_meCpG_count', unmethylated => 'total_unmethylated_CpG_count' },
    { label => 'CHG', description => 'CHG context', methylated => 'total_meCHG_count', unmethylated => 'total_unmethylated_CHG_count' },
    { label => 'CHH', description => 'CHH context', methylated => 'total_meCHH_count', unmethylated => 'total_unmethylated_CHH_count' },
  );

  ### the total number of Cs only comprises the CpG, CHG and CHH contexts, Cs in Unknown context are not included
  my $total_number_of_C = 0;
  foreach my $context (@contexts){
    $total_number_of_C += $counting{$context->{methylated}} + $counting{$context->{unmethylated}};
  }

  ### the Unknown context is only reported in Bowtie 2 mode
  if ($bowtie2){
    push @contexts, { label => 'Unknown', description => 'unknown context (CN or CHN)', methylated => 'total_meC_unknown_count', unmethylated => 'total_unmethylated_C_unknown_count' };
  }

  ### The methylated and the unmethylated counts are each followed by a single blank line. The REPORT file used to receive two
  ### blank lines in Bowtie 2 mode; it now gets the same text as STDERR, which also matches the layout of the single-end report
  my @count_lines = ("Total number of C's analysed:\t$total_number_of_C\n\n");
  foreach my $state ('methylated','unmethylated'){
    foreach my $context (@contexts){
      push @count_lines, "Total $state C's in $context->{label} context:\t$counting{$context->{$state}}\n";
    }
    push @count_lines, "\n";
  }
  foreach my $line (@count_lines){
    warn $line;
  }
  print REPORT @count_lines;

  ### printing the percentage of methylated Cs per context if applicable, i.e. if at least one call was made in this context
  foreach my $context (@contexts){
    my $calls_in_context = $counting{$context->{methylated}} + $counting{$context->{unmethylated}};
    my $message;
    if ($calls_in_context > 0){
      my $percent_methylated = sprintf("%.1f",100*$counting{$context->{methylated}}/$calls_in_context);
      $message = "C methylated in $context->{description}:\t${percent_methylated}%\n";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $context->{description} if value was 0\n";
    }
    warn $message;
    print REPORT $message;
  }
  print REPORT "\n\n";
  warn "\n\n";

}
2034
### Reads a single-end FastA file (plain text, or gzipped if the name ends in .gz) two lines at a time (header line and
### sequence line) and hands every read to check_bowtie_results_single_end(_bowtie2) to look for its alignments.
### Reads up to and including number $skip are passed over, and processing stops once read number $upto has been handled.
### Depending on the value returned for a read it may be written to the AMBIG (--ambiguous) or UNMAPPED (--un) filehandle.
### Once all reads were processed print_final_analysis_report_single_end() is called with the temporary infile names and $pid.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    ### list form of the pipe open: the file name is handed to zcat as it is and is never interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### 3-argument open so that special characters in the file name cannot be mistaken for an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    ### the end of the file (or an incomplete last record) has been reached if one of the two lines could not be read
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
    }

    ### an undefined or empty return value is treated as 0, so that the numerical comparisons below do not raise warnings
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### Releasing the input handle (and reaping a zcat child process, if any). The status is not checked on purpose:
  ### stopping early because of --upto makes zcat exit on a broken pipe, which is not an error here
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);

}
2107
### Reads a single-end FastQ file (plain text, or gzipped if the name ends in .gz) four lines at a time (header, sequence,
### second header and quality line) and hands every read to check_bowtie_results_single_end(_bowtie2) to look for its alignments.
### Reads up to and including number $skip are passed over, and processing stops once read number $upto has been handled.
### Depending on the value returned for a read it may be written to the AMBIG (--ambiguous) or UNMAPPED (--un) filehandle.
### Once all reads were processed print_final_analysis_report_single_end() is called with the temporary infile names and $pid.
sub process_single_end_fastQ_file_for_methylation_call{

  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  if ($sequence_file =~ /\.gz$/){
    ### list form of the pipe open: the file name is handed to zcat as it is and is never interpreted by a shell
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### 3-argument open so that special characters in the file name cannot be mistaken for an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    ### the end of the file (or an incomplete last record) has been reached if one of the four lines could not be read
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    ### $identifier_2 is not chomped as it is only ever printed out again including its line ending
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
    }

    ### an undefined or empty return value is treated as 0, so that the numerical comparisons below do not raise warnings
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  ### Releasing the input handle (and reaping a zcat child process, if any). The status is not checked on purpose:
  ### stopping early because of --upto makes zcat exit on a broken pipe, which is not an error here
  close IN;

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);

}
2189
### Reads the two FastA files of a paired-end experiment in parallel, two lines (header and sequence) per file and read pair,
### and hands every read pair to check_bowtie_results_paired_ends(_bowtie2) to look for its alignments.
### Each input file is read through zcat if its own name ends in .gz, and as plain text otherwise.
### Read pairs up to and including number $skip are passed over, and processing stops once pair number $upto has been handled.
### Depending on the value returned for a pair it may be written to the AMBIG_1/AMBIG_2 (--ambiguous) or UNMAPPED_1/UNMAPPED_2 (--un) filehandles.
### Finally the OUT filehandle is closed and print_final_analysis_report_paired_ends() is called with the temporary infile names and $pid.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)

  ### Gzipped versions of the infiles. The compression is determined for each file individually so that a pair consisting of one
  ### gzipped and one plain text file is read correctly. The list form of the pipe open hands the file name to zcat as it is
  ### (no shell is involved), and the 3-argument open prevents special characters in a file name from being taken as an open mode
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    ### the end of (at least one of) the files has been reached if one of the four lines could not be read
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### keeping copies of the headers as returned by fix_IDs (still including the leading > and the line ending) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### only the identifier of read 1 is passed on to look up the alignments of this sequence pair
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }

    ### an undefined or empty return value is treated as 0, so that the numerical comparisons below do not raise warnings
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  ### Releasing the input handles (and reaping zcat child processes, if any). The status is not checked on purpose:
  ### stopping early because of --upto makes zcat exit on a broken pipe, which is not an error here
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### OUT is not opened within this subroutine; it is closed here before the final report is printed
  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2283
### Reads the two FastQ files of a paired-end experiment in parallel, four lines (header, sequence, second header and quality
### line) per file and read pair, and hands every read pair to check_bowtie_results_paired_ends(_bowtie2) to look for its alignments.
### Each input file is read through zcat if its own name ends in .gz, and as plain text otherwise.
### Read pairs up to and including number $skip are passed over, and processing stops once pair number $upto has been handled.
### Depending on the value returned for a pair it may be written to the AMBIG_1/AMBIG_2 (--ambiguous) or UNMAPPED_1/UNMAPPED_2 (--un) filehandles.
### Finally the OUT filehandle is closed and print_final_analysis_report_paired_ends() is called with the temporary infile names and $pid.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)

  ### Gzipped versions of the infiles. The compression is determined for each file individually so that a pair consisting of one
  ### gzipped and one plain text file is read correctly. The list form of the pipe open hands the file name to zcat as it is
  ### (no shell is involved), and the 3-argument open prevents special characters in a file name from being taken as an open mode
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_1 = <IN1>; # passed on together with the sequences and needed for the ambiguous/unmapped output
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_2 = <IN2>; # passed on together with the sequences and needed for the ambiguous/unmapped output
    ### the end of (at least one of) the files has been reached if one of these lines could not be read
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### keeping copies of the headers as returned by fix_IDs (still including the leading @ and the line ending) for the ambiguous/unmapped output
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    ### $ident_1 and $ident_2 are not chomped as they are only ever printed out again including their line endings
    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    ### only the identifier of read 1 is passed on to look up the alignments of this sequence pair
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }

    ### an undefined or empty return value is treated as 0, so that the numerical comparisons below do not raise warnings
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  ### Releasing the input handles (and reaping zcat child processes, if any). The status is not checked on purpose:
  ### stopping early because of --upto makes zcat exit on a broken pipe, which is not an error here
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### OUT is not opened within this subroutine; it is closed here before the final report is printed
  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2395
2396 sub check_bowtie_results_single_end{
2397 my ($sequence,$identifier,$quality_value) = @_;
2398
2399 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
2400 $quality_value = 'I'x(length$sequence);
2401 }
2402
2403 my %mismatches = ();
2404 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
2405 foreach my $index (0..$#fhs){
2406
2407 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2408 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
2409 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
2410 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2411 ###############################################################
2412 ### STEP I Now processing the alignment stored in last_line ###
2413 ###############################################################
2414 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
2415 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
2416 ### we only continue to extract useful information about this alignment if 1 was returned
2417 if ($valid_alignment_found_1 == 1){
2418 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
2419 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
2420 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
2421
2422 unless($mismatch_info){
2423 $mismatch_info = '';
2424 }
2425
2426 chomp $mismatch_info;
2427 my $chromosome;
2428 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2429 $chromosome = $mapped_chromosome;
2430 }
2431 else{
2432 die "Chromosome number extraction failed for $mapped_chromosome\n";
2433 }
2434 ### Now extracting the number of mismatches to the converted genome
2435 my $number_of_mismatches;
2436 if ($mismatch_info eq ''){
2437 $number_of_mismatches = 0;
2438 }
2439 elsif ($mismatch_info =~ /^\d/){
2440 my @mismatches = split (/,/,$mismatch_info);
2441 $number_of_mismatches = scalar @mismatches;
2442 }
2443 else{
2444 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
2445 }
2446 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2447 my $alignment_location = join (":",$chromosome,$position);
2448 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2449 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
2450 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
2451 ### number for the found alignment)
2452 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
2453 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
2454 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
2455 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
2456 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
2457 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
2458 }
2459 $number_of_mismatches = undef;
2460 ##################################################################################################################################################
2461 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
2462 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
2463 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
2464 ##################################################################################################################################################
2465 my $newline = $fhs[$index]->{fh}-> getline();
2466 if ($newline){
2467 my ($seq_id) = split (/\t/,$newline);
2468 $fhs[$index]->{last_seq_id} = $seq_id;
2469 $fhs[$index]->{last_line} = $newline;
2470 }
2471 else {
2472 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
2473 $fhs[$index]->{last_seq_id} = undef;
2474 $fhs[$index]->{last_line} = undef;
2475 next;
2476 }
2477 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
2478 ### we only continue to extract useful information about this second alignment if 1 was returned
2479 if ($valid_alignment_found_2 == 1){
2480 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
2481 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
2482 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
2483 unless($mismatch_info){
2484 $mismatch_info = '';
2485 }
2486 chomp $mismatch_info;
2487
2488 my $chromosome;
2489 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2490 $chromosome = $mapped_chromosome;
2491 }
2492 else{
2493 die "Chromosome number extraction failed for $mapped_chromosome\n";
2494 }
2495
2496 ### Now extracting the number of mismatches to the converted genome
2497 my $number_of_mismatches;
2498 if ($mismatch_info eq ''){
2499 $number_of_mismatches = 0;
2500 }
2501 elsif ($mismatch_info =~ /^\d/){
2502 my @mismatches = split (/,/,$mismatch_info);
2503 $number_of_mismatches = scalar @mismatches;
2504 }
2505 else{
2506 die "Something weird is going on with the mismatch field\n";
2507 }
2508 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
2509 ### extracting the chromosome number from the bowtie output (see above)
2510 my $alignment_location = join (":",$chromosome,$position);
2511 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
2512 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
2513 ### case we are not writing the same entry out a second time.
2514 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
2515 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
2516 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
2517 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
2518 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
2519 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
2520 }
2521 ####################################################################################################################################
2522 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
2523 ####################################################################################################################################
2524 $newline = $fhs[$index]->{fh}-> getline();
2525 if ($newline){
2526 my ($seq_id) = split (/\t/,$newline);
2527 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
2528 $fhs[$index]->{last_seq_id} = $seq_id;
2529 $fhs[$index]->{last_line} = $newline;
2530 next;
2531 }
2532 else {
2533 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
2534 $fhs[$index]->{last_seq_id} = undef;
2535 $fhs[$index]->{last_line} = undef;
2536 next;
2537 }
2538 ### still within the 2nd sequence in correct orientation found
2539 }
2540 ### still within the 1st sequence in correct orientation found
2541 }
2542 ### still within the if (last_seq_id eq identifier) condition
2543 }
2544 ### still within foreach index loop
2545 }
2546 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
2547 unless(%mismatches){
2548 $counting{no_single_alignment_found}++;
2549 if ($unmapped){
2550 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
2551 }
2552 else{
2553 return;
2554 }
2555 }
2556 #######################################################################################################################################################
2557 #######################################################################################################################################################
2558 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
2559 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
2560 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
2561 #######################################################################################################################################################
2562 #######################################################################################################################################################
2563 ### Going to use the variable $sequence_fails as a memory if a sequence could not be aligned uniquely (set to 1 then)
2564 my $sequence_fails = 0;
2565 ### Declaring an empty hash reference which will store all information we need for the methylation call
2566 my $methylation_call_params; # hash reference!
2567 ### sorting in ascending order
2568 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
2569
2570 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
2571 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
2572 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
2573 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
2574 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
2575 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
2576 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
2577 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
2578 }
2579 }
2580 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
2581 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
2582 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
2583 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
2584 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
2585 ### reaction. E.g.
2586 ### CAGTCACGCGCGCGCG will become
2587 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
2588 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
2589 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
2590 ### G->A conversion:
2591 ### highly methylated: CAATCACACACACACA
2592 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
2593 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
2594 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
2595 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
2596 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
2597 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
2598 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
2599 ### In the above example the number of transliterations required to transform the actual sequence
2600 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
2601 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
2602 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
2603 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
2604 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be
2605 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
2606 my @three_candidate_seqs;
2607 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
2608 my $transliterations_performed;
2609 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
2610 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
2611 }
2612 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
2613 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
2614 }
2615 else{
2616 die "unexpected index number range $!\n";
2617 }
2618 push @three_candidate_seqs,{
2619 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
2620 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
2621 mismatch_number => $mismatch_number,
2622 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
2623 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
2624 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
2625 transliterations_performed => $transliterations_performed,
2626 };
2627 }
2628 ### sorting in ascending order for the lowest number of transliterations performed
2629 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
2630 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
2631 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
2632 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
2633 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
2634 if (($first_array_element*2) < $second_array_element){
2635 $counting{low_complexity_alignments_overruled_count}++;
2636 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
2637 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
2638 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
2639 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
2640 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
2641 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
2642 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
2643 }
2644 else{
2645 $sequence_fails = 1;
2646 }
2647 }
2648 else{
2649 $sequence_fails = 1;
2650 }
2651 ### after processing the alignment with the lowest number of mismatches we exit
2652 last;
2653 }
2654 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
2655 if ($sequence_fails == 1){
2656 $counting{unsuitable_sequence_count}++;
2657 if ($ambiguous){
2658 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
2659 }
2660 if ($unmapped){
2661 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
2662 }
2663 else{
2664 return 0; # => exits to next sequence (default)
2665 }
2666 }
2667
2668 ### --DIRECTIONAL
2669 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2670 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2671 if ($directional){
2672 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
2673 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2674 $counting{alignments_rejected_count}++;
2675 return 0;
2676 }
2677 }
2678
2679 ### If the sequence has not been rejected so far it will have a unique best alignment
2680 $counting{unique_best_alignment_count}++;
2681 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
2682
2683 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
2684 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
2685 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
2686 $counting{genomic_sequence_could_not_be_extracted_count}++;
2687 return 0;
2688 }
2689
2690 ### otherwise we are set to perform the actual methylation call
2691 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
2692
2693 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
2694 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
2695 }
2696
2697 sub check_bowtie_results_single_end_bowtie2{
2698 my ($sequence,$identifier,$quality_value) = @_;
2699
2700
2701 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
2702 $quality_value = 'I'x(length$sequence);
2703 }
2704
2705 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
2706 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
2707 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
2708
2709 my $alignment_ambiguous = 0;
2710 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far
2711 my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true.
2712
2713 my %alignments = ();
2714
2715 ### reading from the Bowtie 2 output filehandles
2716 foreach my $index (0..$#fhs){
2717 # print "Index: $index\n";
2718 # print "$fhs[$index]->{last_line}\n";
2719 # print "$fhs[$index]->{last_seq_id}\n";
2720 # sleep (1);
2721 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2722 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
2723
2724 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
2725 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
2726
2727 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2728 # SAM format specifications for Bowtie 2
2729 # (1) Name of read that aligned
2730 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2731 # 1 The read is one of a pair
2732 # 2 The alignment is one end of a proper paired-end alignment
2733 # 4 The read has no reported alignments
2734 # 8 The read is one of a pair and has no reported alignments
2735 # 16 The alignment is to the reverse reference strand
2736 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2737 # 64 The read is mate 1 in a pair
2738 # 128 The read is mate 2 in a pair
2739 # 256 The read has multiple mapping states
2740 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2741 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2742 # (5) Mapping quality (255 means MAPQ is not available)
2743 # (6) CIGAR string representation of alignment (* if unavailable)
2744 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2745 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2746 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2747 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2748 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2749 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2750 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2751 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2752 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2753 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2754 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2755 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2756 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2757 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2758 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2759 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2760
2761 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
2762
2763 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
2764 if ($flag == 4){
2765 ## reading in the next alignment, which must be the next sequence
2766 my $newline = $fhs[$index]->{fh}-> getline();
2767 if ($newline){
2768 chomp $newline;
2769 my ($seq_id) = split (/\t/,$newline);
2770 $fhs[$index]->{last_seq_id} = $seq_id;
2771 $fhs[$index]->{last_line} = $newline;
2772 if ($seq_id eq $identifier){
2773 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2774 }
2775 next; # next instance
2776 }
2777 else{
2778 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2779 $fhs[$index]->{last_seq_id} = undef;
2780 $fhs[$index]->{last_line} = undef;
2781 next;
2782 }
2783 }
2784
2785 # if there are one or more proper alignments we can extract the chromosome number
2786 my $chromosome;
2787 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
2788 $chromosome = $mapped_chromosome;
2789 }
2790 else{
2791 die "Chromosome number extraction failed for $mapped_chromosome\n";
2792 }
2793
2794 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
2795 my ($alignment_score,$second_best,$MD_tag);
2796 my @fields = split (/\t/,$fhs[$index]->{last_line});
2797
2798 foreach (11..$#fields){
2799 if ($fields[$_] =~ /AS:i:(.*)/){
2800 $alignment_score = $1;
2801 }
2802 elsif ($fields[$_] =~ /XS:i:(.*)/){
2803 $second_best = $1;
2804 }
2805 elsif ($fields[$_] =~ /MD:Z:(.*)/){
2806 $MD_tag = $1;
2807 }
2808 }
2809
2810 if (!defined $best_AS_so_far){
2811 $best_AS_so_far = $alignment_score;
2812 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n";
2813 }
2814 else{
2815 if ($alignment_score > $best_AS_so_far){ # AS are generally negative with a maximum of 0
2816 $best_AS_so_far = $alignment_score;
2817 # warn "Found better alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n";
2818 # resetting the ambiguous within thread memory (if applicable at all)
2819 # warn "Resetting amb within thread value to 0\n";
2820 $amb_same_thread = 0;
2821 }
2822 else{
2823 # warn "current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n";
2824 }
2825 }
2826
2827 # warn "First best alignment_score is: '$alignment_score'\n";
2828 # warn "MD tag is: '$MD_tag'\n";
2829 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag);
2830
2831 if (defined $second_best){
2832 # warn "second best alignment_score is: '$second_best'\n\n";
2833
2834 # If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this
2835 if ($alignment_score == $second_best){
2836
2837 # checking to see if this read produced the best alignment
2838 if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous
2839 # warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n";
2840 $amb_same_thread = 1;
2841 }
2842 else{
2843 # warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n";
2844 }
2845 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether
2846
2847 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2848 until ($fhs[$index]->{last_seq_id} ne $identifier){
2849 my $newline = $fhs[$index]->{fh}-> getline();
2850 if ($newline){
2851 chomp $newline;
2852 my ($seq_id) = split (/\t/,$newline);
2853 $fhs[$index]->{last_seq_id} = $seq_id;
2854 $fhs[$index]->{last_line} = $newline;
2855 }
2856 else{
2857 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2858 $fhs[$index]->{last_seq_id} = undef;
2859 $fhs[$index]->{last_line} = undef;
2860 last; # break free in case we have reached the end of the alignment output
2861 }
2862 }
2863 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2864 }
2865 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2866
2867 my $alignment_location = join (":",$chromosome,$position);
2868
2869 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2870 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2871 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2872 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
2873
2874 unless (exists $alignments{$alignment_location}){
2875 $alignments{$alignment_location}->{seq_id} = $id;
2876 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
2877 $alignments{$alignment_location}->{alignment_score_second_best} = $second_best;
2878 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
2879 $alignments{$alignment_location}->{index} = $index;
2880 $alignments{$alignment_location}->{chromosome} = $chromosome;
2881 $alignments{$alignment_location}->{position} = $position;
2882 $alignments{$alignment_location}->{CIGAR} = $cigar;
2883 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
2884 }
2885
2886 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
2887 until ($fhs[$index]->{last_seq_id} ne $identifier){
2888 my $newline = $fhs[$index]->{fh}-> getline();
2889 if ($newline){
2890 chomp $newline;
2891 my ($seq_id) = split (/\t/,$newline);
2892 $fhs[$index]->{last_seq_id} = $seq_id;
2893 $fhs[$index]->{last_line} = $newline;
2894 }
2895 else{
2896 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2897 $fhs[$index]->{last_seq_id} = undef;
2898 $fhs[$index]->{last_line} = undef;
2899 last; # break free in case we have reached the end of the alignment output
2900 }
2901 }
2902 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2903 }
2904 }
2905 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2906
2907 my $alignment_location = join (":",$chromosome,$position);
2908
2909 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2910 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2911 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2912 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
2913
2914 unless (exists $alignments{$alignment_location}){
2915 $alignments{$alignment_location}->{seq_id} = $id;
2916 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
2917 $alignments{$alignment_location}->{alignment_score_second_best} = undef;
2918 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
2919 $alignments{$alignment_location}->{index} = $index;
2920 $alignments{$alignment_location}->{chromosome} = $chromosome;
2921 $alignments{$alignment_location}->{position} = $position;
2922 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
2923 $alignments{$alignment_location}->{CIGAR} = $cigar;
2924 }
2925
2926 my $newline = $fhs[$index]->{fh}-> getline();
2927 if ($newline){
2928 chomp $newline;
2929 my ($seq_id) = split (/\t/,$newline);
2930 $fhs[$index]->{last_seq_id} = $seq_id;
2931 $fhs[$index]->{last_line} = $newline;
2932 if ($seq_id eq $identifier){
2933 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2934 }
2935 }
2936 else{
2937 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2938 $fhs[$index]->{last_seq_id} = undef;
2939 $fhs[$index]->{last_line} = undef;
2940 }
2941 }
2942 }
2943 }
2944
2945 ### If there were several equally good alignments for the best alignment score we will boot the read
2946 if ($amb_same_thread){
2947 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n";
2948 $alignment_ambiguous = 1;
2949 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n";
2950 }
2951 else{
2952 # warn "alignment won't be considered ambiguous. This time....\n";
2953 }
2954
2955 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
2956 if ($alignment_ambiguous == 1){
2957 $counting{unsuitable_sequence_count}++;
2958 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2959 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
2960 # print "$ambiguous_read_output\n";
2961
2962 if ($ambiguous){
2963 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
2964 }
2965 elsif ($unmapped){
2966 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
2967 }
2968 else{
2969 return 0;
2970 }
2971 }
2972
2973 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
2974 unless(%alignments){
2975 $counting{no_single_alignment_found}++;
2976 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
2977 # print "$unmapped_read_output\n";
2978 if ($unmapped){
2979 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
2980 }
2981 else{
2982 return 0; # default
2983 }
2984 }
2985
2986 #######################################################################################################################################################
2987
2988 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2989 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
2990 ### alignment score we are discarding the sequence altogether.
2991 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
2992 ### opening (5) and extending (3 per bp) the gap.
2993
2994 #######################################################################################################################################################
2995
2996 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
2997 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2998
2999 ### print contents of %alignments for debugging
3000 # if (scalar keys %alignments > 1){
3001 # print "\n******\n";
3002 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
3003 # print "Loc: $alignment_location\n";
3004 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
3005 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
3006 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
3007 # print "Index $alignments{$alignment_location}->{index}\n";
3008 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
3009 # print "pos: $alignments{$alignment_location}->{position}\n";
3010 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
3011 # }
3012 # print "\n******\n";
3013 # }
3014
3015 ### if there is only 1 entry in the hash we accept it as the best alignment
3016 if (scalar keys %alignments == 1){
3017 for my $unique_best_alignment (keys %alignments){
3018 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
3019 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
3020 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
3021 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
3022 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
3023 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best};
3024 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
3025 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
3026 }
3027 }
3028
3029 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
3030 ### we boot the sequence altogether
3031 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
3032 my $best_alignment_score;
3033 my $best_alignment_location;
3034 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
3035 # print "$alignments{$alignment_location}->{alignment_score}\n";
3036 unless (defined $best_alignment_score){
3037 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
3038 $best_alignment_location = $alignment_location;
3039 # print "setting best alignment score: $best_alignment_score\n";
3040 }
3041 else{
3042 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
3043 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
3044 # warn "Same alignment score, the sequence will get booted!\n";
3045 $sequence_fails = 1;
3046 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
3047 }
3048 ### else we are going to store the best alignment for further processing
3049 else{
3050 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
3051 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
3052 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
3053 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
3054 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
3055 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
3056 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
3057 if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) {
3058 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best};
3059 }
3060 else {
3061 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score};
3062 }
3063 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
3064 }
3065 }
3066 }
3067 }
3068 else{
3069 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
3070 }
3071
3072 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
3073 if ($sequence_fails == 1){
3074 $counting{unsuitable_sequence_count}++;
3075
3076 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
3077 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
3078 # print OUT "$ambiguous_read_output\n";
3079
3080 if ($ambiguous){
3081 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
3082 }
3083 elsif ($unmapped){
3084 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
3085 }
3086 else{
3087 return 0; # => exits to next sequence (default)
3088 }
3089 }
3090
3091 ### --DIRECTIONAL
3092 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
3093 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
3094 if ($directional){
3095 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
3096 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
3097 $counting{alignments_rejected_count}++;
3098 return 0;
3099 }
3100 }
3101
3102 ### If the sequence has not been rejected so far it has a unique best alignment
3103 $counting{unique_best_alignment_count}++;
3104
3105 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
3106 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
3107
3108 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
3109 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
3110 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
3111 $counting{genomic_sequence_could_not_be_extracted_count}++;
3112 return 0;
3113 }
3114
3115 # Compute MAPQ value
3116 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef,
3117 $methylation_call_params->{$identifier}->{alignment_score},
3118 $methylation_call_params->{$identifier}->{alignment_score_second_best});
3119
3120
3121
3122 ### otherwise we are set to perform the actual methylation call
3123 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
3124 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
3125 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
3126 }
3127
3128
### Returns the number of characters in a sequence that the given read conversion would replace.
###
### Arguments:
###   $sequence         - the read sequence (only upper-case C or G characters are counted)
###   $read_conversion  - 'CT' (counts Cs, i.e. the bases a C->T conversion would touch) or
###                       'GA' (counts Gs, i.e. the bases a G->A conversion would touch)
###
### Returns the count as a plain number (0 if the sequence contains no matching character).
### Dies if $read_conversion is neither 'CT' nor 'GA' (this includes an undefined value).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  ### an undefined conversion mode is reported by the die below instead of first triggering an
  ### 'uninitialized value' warning in the string comparisons
  $read_conversion = '' unless (defined $read_conversion);

  ### tr/// with an empty replacement list leaves the string untouched and simply returns the number
  ### of matching characters, which is all we need here
  if ($read_conversion eq 'CT'){
    return ($sequence =~ tr/C//);
  }
  elsif ($read_conversion eq 'GA'){
    return ($sequence =~ tr/G//);
  }

  ### $! is deliberately not part of this message: no system call has failed at this point, so it would
  ### only append a stale and unrelated OS error text
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got '$read_conversion')\n";
}
3143
### Decides whether the Bowtie 1 single-end alignment currently buffered for filehandle $index belongs to
### the read $identifier and is in an acceptable orientation.
###
### Arguments:
###   $index       - index into @fhs (the entry holds the keys fh, last_seq_id and last_line)
###   $identifier  - ID of the read that is currently being processed
###
### Returns 1 if an alignment of $identifier in the correct orientation is now stored in
### $fhs[$index]->{last_line}, and 0 otherwise.
###
### Side effects: if the buffered alignment is in the wrong orientation, up to two further lines are read
### from $fhs[$index]->{fh}; last_seq_id/last_line are updated to the line that has to be looked at next,
### or set to undef once the end of the Bowtie output has been reached.
###
### Dies if ensure_sensical_alignment_orientation_single_end returns something other than 0 or 1, or if
### the same read ID shows up three times in a row.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### small helpers for the two kinds of bookkeeping on the alignment buffer of this filehandle
  my $store_line = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
    return;
  };
  my $mark_end_of_output = sub {
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line}   = undef;
    return;
  };

  # extracting from Bowtie 1 format (first column: read ID, second column: strand)
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions,
  ### however only 4 of them are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1); ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The buffered alignment was in the wrong orientation, so we need to read in a new line
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $mark_end_of_output->();
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }
  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the line we just read in is already the next sequence to be analysed -> store it in @fhs
  unless ($id eq $identifier){
    $store_line->($id,$newline);
    return 0; # processing the new alignment result only in the next round
  }

  ### still the same read: checking the orientation of its second alignment
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $store_line->($id,$newline);
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The second alignment was in the wrong orientation as well, so we read in yet another line and store it in @fhs
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    $mark_end_of_output->();
    return 0; # not processing anything as both alignments of this read were in the wrong orientation
  }

  ### the next line must not have the same seq ID again. $! is deliberately not part of the message: the
  ### read above succeeded, so $! would only contribute a stale and unrelated OS error text
  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seq_id eq $identifier);

  ### overwriting the current seq-ID and bowtie output with the fields of the entry we just read
  $store_line->($seq_id,$newline);
  return 0; # not processing anything this round as both alignments of this read were in the wrong orientation
}
3221 #########################
3222 ### BOWTIE 1 | PAIRED-END
3223 #########################
3224
3225 sub check_bowtie_results_paired_ends{
3226 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
3227
3228 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
3229 unless ($quality_value_1){
3230 $quality_value_1 = 'I'x(length$sequence_1);
3231 }
3232 unless ($quality_value_2){
3233 $quality_value_2 = 'I'x(length$sequence_2);
3234 }
3235
3236 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
3237 # sleep (1);
3238 my %mismatches = ();
3239 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
3240
3241
3242 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
3243 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
3244 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
3245 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
3246 ### strands are not being reported by specifying --directional
3247
3248 foreach my $index (0,3,1,2){
3249 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
3250 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
3251 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
3252 if ($fhs[$index]->{last_seq_id} eq $identifier) {
3253 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
3254
3255 ##################################################################################
3256 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
3257 ##################################################################################
3258 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
3259 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments both in the wrong
3260 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
3261 if ($valid_alignment_found == 1){
3262 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
3263 ### we store the useful information in %mismatches
3264 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
3265 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
3266 chomp $mismatch_info_1;
3267 chomp $mismatch_info_2;
3268
3269 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
3270 my ($chromosome_1,$chromosome_2);
3271 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
3272 $chromosome_1 = $mapped_chromosome_1;
3273 }
3274 else{
3275 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
3276 }
3277 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
3278 $chromosome_2 = $mapped_chromosome_2;
3279 }
3280 else{
3281 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
3282 }
3283
3284 ### Now extracting the number of mismatches to the converted genome
3285 my $number_of_mismatches_1;
3286 my $number_of_mismatches_2;
3287 if ($mismatch_info_1 eq ''){
3288 $number_of_mismatches_1 = 0;
3289 }
3290 elsif ($mismatch_info_1 =~ /^\d/){
3291 my @mismatches = split (/,/,$mismatch_info_1);
3292 $number_of_mismatches_1 = scalar @mismatches;
3293 }
3294 else{
3295 die "Something weird is going on with the mismatch field\n";
3296 }
3297 if ($mismatch_info_2 eq ''){
3298 $number_of_mismatches_2 = 0;
3299 }
3300 elsif ($mismatch_info_2 =~ /^\d/){
3301 my @mismatches = split (/,/,$mismatch_info_2);
3302 $number_of_mismatches_2 = scalar @mismatches;
3303 }
3304 else{
3305 die "Something weird is going on with the mismatch field\n";
3306 }
3307 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
3308 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
3309 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
3310 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
3311 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
3312 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
3313 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
3314 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
3315 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
3316 ### number for the found alignment)
3317 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
3318 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
3319 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
3320 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
3321 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
3322 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
3323 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
3324 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
3325 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
3326 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
3327 }
3328 ###################################################################################################################################################
3329 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
3330 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ###
3331 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
3332 ### this round ###
3333 ###################################################################################################################################################
3334 my $newline_1 = $fhs[$index]->{fh}-> getline();
3335 my $newline_2 = $fhs[$index]->{fh}-> getline();
3336
3337 if ($newline_1 and $newline_2){
3338 my ($seq_id_1) = split (/\t/,$newline_1);
3339 my ($seq_id_2) = split (/\t/,$newline_2);
3340
3341 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
3342 $fhs[$index]->{last_seq_id} = $seq_id_1;
3343 }
3344 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
3345 $fhs[$index]->{last_seq_id} = $seq_id_2;
3346 }
3347 else{
3348 die "Either read 1 or read 2 needs to end on '/1'\n";
3349 }
3350
3351 $fhs[$index]->{last_line_1} = $newline_1;
3352 $fhs[$index]->{last_line_2} = $newline_2;
3353 }
3354 else {
3355 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
3356 $fhs[$index]->{last_seq_id} = undef;
3357 $fhs[$index]->{last_line_1} = undef;
3358 $fhs[$index]->{last_line_2} = undef;
3359 next; # jumping to the next index
3360 }
3361 ### Now processing the entry we just stored in last_line_1 and last_line_2
3362 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
3363 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
3364 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
3365 if ($valid_alignment_found == 1){
3366 ### we store the useful information in %mismatches
3367 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
3368 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
3369 chomp $mismatch_info_1;
3370 chomp $mismatch_info_2;
3371 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
3372 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
3373 $chromosome_1 = $mapped_chromosome_1;
3374 }
3375 else{
3376 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
3377 }
3378 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
3379 $chromosome_2 = $mapped_chromosome_2;
3380 }
3381 else{
3382 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
3383 }
3384
3385 $number_of_mismatches_1='';
3386 $number_of_mismatches_2='';
3387 ### Now extracting the number of mismatches to the converted genome
3388 if ($mismatch_info_1 eq ''){
3389 $number_of_mismatches_1 = 0;
3390 }
3391 elsif ($mismatch_info_1 =~ /^\d/){
3392 my @mismatches = split (/,/,$mismatch_info_1);
3393 $number_of_mismatches_1 = scalar @mismatches;
3394 }
3395 else{
3396 die "Something weird is going on with the mismatch field\n";
3397 }
3398 if ($mismatch_info_2 eq ''){
3399 $number_of_mismatches_2 = 0;
3400 }
3401 elsif ($mismatch_info_2 =~ /^\d/){
3402 my @mismatches = split (/,/,$mismatch_info_2);
3403 $number_of_mismatches_2 = scalar @mismatches;
3404 }
3405 else{
3406 die "Something weird is going on with the mismatch field\n";
3407 }
3408 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
3409 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
3410 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
3411 die "position 1 is greater than position 2" if ($position_1 > $position_2);
3412 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
3413 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
3414 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
3415 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
3416 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
3417 ### number for the found alignment)
3418 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
3419 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
3420 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
3421 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
3422 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
3423 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
3424 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
3425 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
3426 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
3427 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
3428 }
3429 ###############################################################################################################################################
3430 ### STEP III Now reading in two more lines. These have to be the next entry and we will just add assign them to last_line_1 and last_line_2 ###
3431 ###############################################################################################################################################
3432 $newline_1 = $fhs[$index]->{fh}-> getline();
3433 $newline_2 = $fhs[$index]->{fh}-> getline();
3434
3435 if ($newline_1 and $newline_2){
3436 my ($seq_id_1) = split (/\t/,$newline_1);
3437 my ($seq_id_2) = split (/\t/,$newline_2);
3438
3439 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
3440 $fhs[$index]->{last_seq_id} = $seq_id_1;
3441 }
3442 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
3443 $fhs[$index]->{last_seq_id} = $seq_id_2;
3444 }
3445 $fhs[$index]->{last_line_1} = $newline_1;
3446 $fhs[$index]->{last_line_2} = $newline_2;
3447 }
3448 else {
3449 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
3450 $fhs[$index]->{last_seq_id} = undef;
3451 $fhs[$index]->{last_line_1} = undef;
3452 $fhs[$index]->{last_line_2} = undef;
3453 next; # jumping to the next index
3454 }
3455 ### within the 2nd sequence pair alignment in correct orientation found
3456 }
3457 ### within the 1st sequence pair alignment in correct orientation found
3458 }
3459 ### still within the (last_seq_id eq identifier) condition
3460 }
3461 ### still within foreach index loop
3462 }
3463 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
3464 unless(%mismatches){
3465 $counting{no_single_alignment_found}++;
3466 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
3467 }
3468 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
3469 my $sequence_pair_fails = 0;
3470 ### Declaring an empty hash reference which will store all information we need for the methylation call
3471 my $methylation_call_params; # hash reference!
3472 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
3473 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
3474 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
3475 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
3476 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
3477 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
3478 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
3479 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
3480 }
3481 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
3482 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
3483 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
3484 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
3485 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
3486 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
3487 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
3488 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
3489 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
3490 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
3491 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
3492 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
3493 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
3494 }
3495 }
3496 else{
3497 $sequence_pair_fails = 1;
3498 }
3499 ### after processing the alignment with the lowest number of mismatches we exit
3500 last;
3501 }
3502 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
3503 if ($sequence_pair_fails == 1){
3504 $counting{unsuitable_sequence_count}++;
3505 if ($ambiguous){
3506 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
3507 }
3508 if ($unmapped){
3509 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
3510 }
3511 else{
3512 return 0; # => exits to next sequence (default)
3513 }
3514 }
3515
3516 ### --DIRECTIONAL
3517 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
3518 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
3519 if ($directional){
3520 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
3521 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
3522 $counting{alignments_rejected_count}++;
3523 return 0;
3524 }
3525 }
3526
3527 ### If the sequence has not been rejected so far it does have a unique best alignment
3528 $counting{unique_best_alignment_count}++;
3529 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
3530
3531 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
3532 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
3533 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
3534 $counting{genomic_sequence_could_not_be_extracted_count}++;
3535 return 0;
3536 }
3537 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
3538 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
3539 $counting{genomic_sequence_could_not_be_extracted_count}++;
3540 return 0;
3541 }
3542
3543 ### otherwise we are set to perform the actual methylation call
3544 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
3545 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
3546
3547 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
3548 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
3549 }
3550
3551 #########################
3552 ### BOWTIE 2 | PAIRED-END
3553 #########################
3554
3555 sub check_bowtie_results_paired_ends_bowtie2{
3556 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
3557
3558 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
3559 unless ($quality_value_1){
3560 $quality_value_1 = 'I'x(length$sequence_1);
3561 }
3562
3563 unless ($quality_value_2){
3564 $quality_value_2 = 'I'x(length$sequence_2);
3565 }
3566
3567
3568 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
3569
3570
3571 my %alignments;
3572 my $alignment_ambiguous = 0;
3573 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far
3574 my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true.
3575
3576 ### reading from the Bowtie 2 output filehandles
3577
3578 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
3579 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
3580 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
3581 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
3582 ### strands are not being reported when '--directional' is specified
3583
3584 foreach my $index (0,3,1,2){
3585 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
3586 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
3587
3588 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
3589 if ($fhs[$index]->{last_seq_id} eq $identifier) {
3590
3591 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
3592 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
3593 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
3594 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
3595 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
3596 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
3597 $id_1 =~ s/\/1$//;
3598 $id_2 =~ s/\/2$//;
3599
3600 # SAM format specifications for Bowtie 2
3601 # (1) Name of read that aligned
3602 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
3603 # 1 The read is one of a pair
3604 # 2 The alignment is one end of a proper paired-end alignment
3605 # 4 The read has no reported alignments
3606 # 8 The read is one of a pair and has no reported alignments
3607 # 16 The alignment is to the reverse reference strand
3608 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
3609 # 64 The read is mate 1 in a pair
3610 # 128 The read is mate 2 in a pair
3611 # 256 The read has multiple mapping states
3612 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
3613 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
3614 # (5) Mapping quality (255 means MAPQ is not available)
3615 # (6) CIGAR string representation of alignment (* if unavailable)
3616 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
3617 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
3618 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
3619 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
3620 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
3621 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
3622 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
3623 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
3624 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
3625 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
3626 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
3627 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
3628 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
3629 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
3630 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
3631 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
3632
3633 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
3634 ### We can store the next alignment and move on to the next Bowtie 2 instance
3635 if ($flag_1 == 77 and $flag_2 == 141){
3636 ## reading in the next alignment, which must be the next sequence
3637 my $newline_1 = $fhs[$index]->{fh}-> getline();
3638 my $newline_2 = $fhs[$index]->{fh}-> getline();
3639
3640 if ($newline_1 and $newline_2){
3641 chomp $newline_1;
3642 chomp $newline_2;
3643 my ($seq_id_1) = split (/\t/,$newline_1);
3644 my ($seq_id_2) = split (/\t/,$newline_2);
3645 $seq_id_1 =~ s/\/1$//;
3646 $seq_id_2 =~ s/\/2$//;
3647 $fhs[$index]->{last_seq_id} = $seq_id_1;
3648 $fhs[$index]->{last_line_1} = $newline_1;
3649 $fhs[$index]->{last_line_2} = $newline_2;
3650
3651 # print "current sequence ($identifier) did not map, reading in next sequence\n";
3652 # print "$index\t$fhs[$index]->{last_seq_id}\n";
3653 # print "$index\t$fhs[$index]->{last_line_1}\n";
3654 # print "$index\t$fhs[$index]->{last_line_2}\n";
3655 next; # next instance
3656 }
3657 else{
3658 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
3659 $fhs[$index]->{last_seq_id} = undef;
3660 $fhs[$index]->{last_line_1} = undef;
3661 $fhs[$index]->{last_line_2} = undef;
3662 next;
3663 }
3664 }
3665
3666 ### If there are one or more proper alignments we can extract the chromosome number
3667 my ($chromosome_1,$chromosome_2);
3668 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
3669 $chromosome_1 = $mapped_chromosome_1;
3670 }
3671 else{
3672 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
3673 }
3674 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
3675 $chromosome_2 = $mapped_chromosome_2;
3676 }
3677 else{
3678 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
3679 }
3680
3681 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
3682
3683 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
3684 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
3685
3686 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
3687 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
3688
3689 foreach (11..$#fields_1){
3690 if ($fields_1[$_] =~ /AS:i:(.*)/){
3691 $alignment_score_1 = $1;
3692 }
3693 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
3694 $second_best_1 = $1;
3695 }
3696 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
3697 $MD_tag_1 = $1;
3698 }
3699 }
3700
3701 foreach (11..$#fields_2){
3702 if ($fields_2[$_] =~ /AS:i:(.*)/){
3703 $alignment_score_2 = $1;
3704 }
3705 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
3706 $second_best_2 = $1;
3707 }
3708 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
3709 $MD_tag_2 = $1;
3710 }
3711 }
3712
3713 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
3714 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
3715
3716 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
3717 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
3718 # warn "MD tag 1 is: '$MD_tag_1'\n";
3719 # warn "MD tag 2 is: '$MD_tag_2'\n";
3720
3721 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
3722 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
3723 # warn "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
3724
3725 if (!defined $best_AS_so_far){
3726 $best_AS_so_far = $sum_of_alignment_scores_1;
3727 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n";
3728 }
3729 else{
3730 if ($sum_of_alignment_scores_1 > $best_AS_so_far){ # AS are generally negative with a maximum of 0
3731 $best_AS_so_far = $sum_of_alignment_scores_1;
3732 # warn "Found better sum of alignment scores ($sum_of_alignment_scores), setting \$best_AS_so_far to $best_AS_so_far\n";
3733 # resetting the ambiguous within thread memory (if applicable at all)
3734 # warn "Resetting amb within thread value to 0\n";
3735 $amb_same_thread = 0;
3736 }
3737 else{
3738 # warn "current alignment (AS $sum_of_alignment_scores) isn't better than the best so far ($best_AS_so_far). Not changing anything\n";
3739 }
3740 }
3741
3742 if (defined $second_best_1 and defined $second_best_2){
3743 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
3744 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
3745 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
3746 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
3747
3748 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we keep a memory of this
3749 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
3750
3751 # checking to see if this read pair produced the best alignment
3752 if ($sum_of_alignment_scores_1 == $best_AS_so_far){ # yes this is the best read pair so far, either within the thread or between threads, however it is ambiguous
3753 # warn "Read pair is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n";
3754 $amb_same_thread = 1;
3755 }
3756 else{
3757 # warn "This read pair has a worse alignment score than the best alignment so far and will be ignored even though it is ambiguous in itself\n";
3758 }
3759
3760 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether one way or another
3761
3762 ## need to read and discard all additional ambiguous reads until we reach the next sequence
3763 until ($fhs[$index]->{last_seq_id} ne $identifier){
3764 my $newline_1 = $fhs[$index]->{fh}-> getline();
3765 my $newline_2 = $fhs[$index]->{fh}-> getline();
3766 if ($newline_1 and $newline_2){
3767 chomp $newline_1;
3768 chomp $newline_2;
3769 my ($seq_id_1) = split (/\t/,$newline_1);
3770 my ($seq_id_2) = split (/\t/,$newline_2);
3771 $seq_id_1 =~ s/\/1$//;
3772 $seq_id_2 =~ s/\/2$//;
3773 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
3774
3775 $fhs[$index]->{last_seq_id} = $seq_id_1;
3776 $fhs[$index]->{last_line_1} = $newline_1;
3777 $fhs[$index]->{last_line_2} = $newline_2;
3778 }
3779 else{
3780 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
3781 $fhs[$index]->{last_seq_id} = undef;
3782 $fhs[$index]->{last_line_1} = undef;
3783 $fhs[$index]->{last_line_2} = undef;
3784 last; # break free if the end of the alignment output was reached
3785 }
3786 }
3787 # if ($fhs[$index]->{last_seq_id}){
3788 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
3789 # }
3790 }
3791 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
3792
3793 my $alignment_location;
3794 if ($position_1 <= $position_2){
3795 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
3796 }
3797 elsif($position_2 < $position_1){
3798 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
3799 }
3800
3801 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
3802 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
3803 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
3804 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
3805
3806 unless (exists $alignments{$alignment_location}){
3807 $alignments{$alignment_location}->{seq_id} = $id_1;
3808 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
3809 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
3810 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
3811 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = $sum_of_alignment_scores_second_best;
3812 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
3813 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
3814 $alignments{$alignment_location}->{index} = $index;
3815 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
3816 $alignments{$alignment_location}->{position_1} = $position_1;
3817 $alignments{$alignment_location}->{position_2} = $position_2;
3818 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
3819 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
3820 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
3821 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
3822 $alignments{$alignment_location}->{flag_1} = $flag_1;
3823 $alignments{$alignment_location}->{flag_2} = $flag_2;
3824 }
3825 # warn "added best of several alignments to \%alignments hash\n";
3826
3827 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
3828 until ($fhs[$index]->{last_seq_id} ne $identifier){
3829 my $newline_1 = $fhs[$index]->{fh}-> getline();
3830 my $newline_2 = $fhs[$index]->{fh}-> getline();
3831 if ($newline_1 and $newline_2){
3832 chomp $newline_1;
3833 chomp $newline_2;
3834 my ($seq_id_1) = split (/\t/,$newline_1);
3835 my ($seq_id_2) = split (/\t/,$newline_2);
3836 $seq_id_1 =~ s/\/1$//;
3837 $seq_id_2 =~ s/\/2$//;
3838 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
3839
3840 $fhs[$index]->{last_seq_id} = $seq_id_1;
3841 $fhs[$index]->{last_line_1} = $newline_1;
3842 $fhs[$index]->{last_line_2} = $newline_2;
3843 }
3844 else{
3845 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
3846 $fhs[$index]->{last_seq_id} = undef;
3847 $fhs[$index]->{last_line_1} = undef;
3848 $fhs[$index]->{last_line_2} = undef;
3849 last; # break free if the end of the alignment output was reached
3850 }
3851 }
3852 # if($fhs[$index]->{last_seq_id}){
3853 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
3854 # }
3855 }
3856 }
3857 else{ # there is no second best hit, so we can just store this one and read in the next sequence
3858
3859 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
3860 # print "$alignment_location\n";
3861 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
3862 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
3863 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
3864 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
3865
3866 unless (exists $alignments{$alignment_location}){
3867 $alignments{$alignment_location}->{seq_id} = $id_1;
3868 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
3869 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
3870 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
3871 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = undef;
3872 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
3873 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
3874 $alignments{$alignment_location}->{index} = $index;
3875 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
3876 $alignments{$alignment_location}->{position_1} = $position_1;
3877 $alignments{$alignment_location}->{position_2} = $position_2;
3878 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
3879 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
3880 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
3881 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
3882 $alignments{$alignment_location}->{flag_1} = $flag_1;
3883 $alignments{$alignment_location}->{flag_2} = $flag_2;
3884 }
3885
3886 # warn "added unique alignment to \%alignments hash\n";
3887
3888 # Now reading and storing the next read pair
3889 my $newline_1 = $fhs[$index]->{fh}-> getline();
3890 my $newline_2 = $fhs[$index]->{fh}-> getline();
3891 if ($newline_1 and $newline_2){
3892 chomp $newline_1;
3893 chomp $newline_2;
3894 # print "$newline_1\n";
3895 # print "$newline_2\n";
3896 my ($seq_id_1) = split (/\t/,$newline_1);
3897 my ($seq_id_2) = split (/\t/,$newline_2);
3898 $seq_id_1 =~ s/\/1$//;
3899 $seq_id_2 =~ s/\/2$//;
3900 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
3901
3902 $fhs[$index]->{last_seq_id} = $seq_id_1;
3903 $fhs[$index]->{last_line_1} = $newline_1;
3904 $fhs[$index]->{last_line_2} = $newline_2;
3905
3906 if ($seq_id_1 eq $identifier){
3907 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
3908 }
3909 }
3910 else{
3911 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
3912 $fhs[$index]->{last_seq_id} = undef;
3913 $fhs[$index]->{last_line_1} = undef;
3914 $fhs[$index]->{last_line_2} = undef;
3915 }
3916 }
3917 }
3918 }
3919
3920 ### If there were several equally good alignments for the best alignment score we will boot the read
3921 if ($amb_same_thread){
3922 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n";
3923 $alignment_ambiguous = 1;
3924 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n";
3925 }
3926 else{
3927 # warn "alignment won't be considered ambiguous. This time....\n";
3928 }
3929
3930
3931 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
3932 if ($alignment_ambiguous == 1){
3933 $counting{unsuitable_sequence_count}++;
3934 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
3935 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
3936 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
3937 # print "$ambiguous_read_1\n";
3938 # print "$ambiguous_read_2\n";
3939
3940 if ($ambiguous){
3941 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
3942 }
3943 elsif ($unmapped){
3944 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
3945 }
3946 else{
3947 return 0;
3948 }
3949 }
3950
3951 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
3952 unless (%alignments){
3953 $counting{no_single_alignment_found}++;
3954
3955 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
3956 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
3957 # print "$unmapped_read_1\n";
3958 # print "$unmapped_read_2\n";
3959 if ($unmapped){
3960 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
3961 }
3962 else{
3963 return 0;
3964 }
3965 }
3966
3967 #######################################################################################################################################################
3968
3969 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
3970 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
3971 ### alignment score we are discarding the sequence pair altogether.
3972 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
3973 ### and extending (3 per bp) the gap.
3974
3975 #######################################################################################################################################################
3976
3977 ### Declaring an empty hash reference which will store all information we need for the methylation call
3978 my $methylation_call_params; # hash reference
3979 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
3980
3981 ### print contents of %alignments for debugging
3982 ## if (scalar keys %alignments >= 1){
3983 # print "\n******\n";
3984 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
3985 # print "Loc: $alignment_location\n";
3986 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
3987 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
3988 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
3989 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
3990 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
3991 # print "Index $alignments{$alignment_location}->{index}\n";
3992 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
3993 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
3994 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
3995 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
3996 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
3997 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
3998 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
3999 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
4000 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
4001 # }
4002 # print "\n******\n";
4003 # }
4004
4005 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
4006 if (scalar keys %alignments == 1){
4007 for my $unique_best_alignment (keys %alignments){
4008 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
4009 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
4010 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
4011 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
4012 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
4013 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
4014 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
4015 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
4016 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
4017 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores_second_best};
4018 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
4019 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
4020 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
4021 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
4022 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
4023 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
4024 }
4025 }
4026
4027 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
4028 ### we boot the sequence pair altogether)
4029 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
4030 my $best_sum_of_alignment_scores;
4031 my $best_alignment_location;
4032 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
4033
4034 # warn "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; sleep(1);
4035
4036 unless (defined $best_sum_of_alignment_scores){
4037 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
4038 $best_alignment_location = $alignment_location;
4039 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
4040 }
4041 else{
4042 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
4043 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
4044 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
4045 $sequence_pair_fails = 1;
4046 last; # exiting since we know that the sequence has ambiguous alignments
4047 }
4048 ### else we are going to store the best alignment for further processing
4049 else{
4050 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
4051 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
4052 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
4053 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
4054 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
4055 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
4056 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
4057 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
4058 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
4059 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
4060 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
4061 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
4062 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
4063 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
4064 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
4065
4066 if (defined $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} and ( $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} > $alignments{$alignment_location}->{sum_of_alignment_scores} )) {
4067 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best};
4068 }
4069 else {
4070 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$alignment_location}->{sum_of_alignment_scores};
4071 }
4072
4073 last; # exiting since the sequence produced a unique best alignment
4074 }
4075 }
4076 }
4077 }
4078 else{
4079 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
4080 }
4081
4082 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
4083 if ($sequence_pair_fails == 1){
4084 $counting{unsuitable_sequence_count}++;
4085
4086 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
4087 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
4088 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
4089 # warn "$ambiguous_read_1\n";
4090 # warn "$ambiguous_read_2\n";
4091
4092 if ($ambiguous){
4093 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
4094 }
4095 elsif ($unmapped){
4096 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
4097 }
4098 else{
4099 return 0; # => exits to next sequence pair (default)
4100 }
4101 }
4102
4103 ### --DIRECTIONAL
4104 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
4105 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
4106 if ($directional){
4107 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
4108 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
4109 $counting{alignments_rejected_count}++;
4110 return 0;
4111 }
4112 }
4113
4114 ### If the sequence pair has not been rejected so far it does have a unique best alignment
4115 $counting{unique_best_alignment_count}++;
4116 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
4117
4118 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
4119 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
4120 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n";
4121 $counting{genomic_sequence_could_not_be_extracted_count}++;
4122 return 0;
4123 }
4124 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
4125 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n";
4126 $counting{genomic_sequence_could_not_be_extracted_count}++;
4127 return 0;
4128 }
4129
4130 ### Compute MAPQ value
4131 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence_1), length($sequence_2),
4132 $methylation_call_params->{$identifier}->{sum_of_alignment_scores},
4133 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best});
4134
4135
4136 ### now we are set to perform the actual methylation call
4137 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
4138 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
4139 # warn "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
4140 # warn " $sequence_2\n";
4141 # warn "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
4142 # warn " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
4143
4144 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
4145 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
4146 }
4147
4148 ###
4149
# Compute the MAPQ value for a read or read pair as in Bowtie2-2.2.2 (specifically, V2 of the MAPQ calculator: "class BowtieMapq2"),
# assuming end-to-end alignment with the default calculation of the minimum alignment score
#
# Arguments: length of read 1,
#            length of read 2 (only defined for paired-end reads),
#            best alignment score (for read pairs: the sum of both alignment scores),
#            second best alignment score (undef if there was no second best alignment)
# Returns:   an integer MAPQ value between 0 and 42
#
# The minimum alignment score is derived from the file-wide settings $score_min_intercept and $score_min_slope

sub calc_mapq {
  my ($read1Len, $read2Len, $AS_best, $AS_secBest) = @_;

  ### the minimum score is a linear function of the read length. read2Len is only defined for paired-end reads, in
  ### which case the minimum scores of both reads are added up; in single-end mode the value for read 1 is all we need
  my $scMin = $score_min_intercept + $score_min_slope * $read1Len;
  $scMin += $score_min_intercept + $score_min_slope * $read2Len if (defined $read2Len);

  my $diff     = abs($scMin);         # scores can vary by up to this much (since max AS is 0 for end-to-end alignment)
  my $bestOver = $AS_best - $scMin;   # distance of the best score from the minimum score

  ### no second best alignment: the MAPQ only depends on how far the best score lies above the minimum score
  unless (defined $AS_secBest){
    return 42 if ($bestOver >= $diff * 0.8);
    return 40 if ($bestOver >= $diff * 0.7);
    return 24 if ($bestOver >= $diff * 0.6);
    return 23 if ($bestOver >= $diff * 0.5);
    return  8 if ($bestOver >= $diff * 0.4);
    return  3 if ($bestOver >= $diff * 0.3);
    return 0;
  }

  ### a second best alignment exists: the MAPQ depends on the gap between best and second best score (tested from the
  ### widest gap downwards) and, within each gap class, on the quality of the best alignment itself
  my $bestDiff = abs(abs($AS_best) - abs($AS_secBest));
  my $perfect  = ($bestOver == $diff);   # true if the best alignment achieved the maximum possible score

  return ($perfect ? 39 : 33) if ($bestDiff >= $diff * 0.9);
  return ($perfect ? 38 : 27) if ($bestDiff >= $diff * 0.8);
  return ($perfect ? 37 : 26) if ($bestDiff >= $diff * 0.7);
  return ($perfect ? 36 : 22) if ($bestDiff >= $diff * 0.6);

  if ($bestDiff >= $diff * 0.5){
    return 35 if ($perfect);
    return 25 if ($bestOver >= $diff * 0.84);
    return 16 if ($bestOver >= $diff * 0.68);
    return 5;
  }

  if ($bestDiff >= $diff * 0.4){
    return 34 if ($perfect);
    return 21 if ($bestOver >= $diff * 0.84);
    return 14 if ($bestOver >= $diff * 0.68);
    return 4;
  }

  if ($bestDiff >= $diff * 0.3){
    return 32 if ($perfect);
    return 18 if ($bestOver >= $diff * 0.88);
    return 15 if ($bestOver >= $diff * 0.67);
    return 3;
  }

  if ($bestDiff >= $diff * 0.2){
    return 31 if ($perfect);
    return 17 if ($bestOver >= $diff * 0.88);
    return 11 if ($bestOver >= $diff * 0.67);
    return 0;
  }

  if ($bestDiff >= $diff * 0.1){
    return 30 if ($perfect);
    return 12 if ($bestOver >= $diff * 0.88);
    return  7 if ($bestOver >= $diff * 0.67);
    return 0;
  }

  ### best and second best score differ by less than 10% of the possible score range
  if ($bestDiff > 0){
    return ($bestOver >= $diff * 0.67) ? 6 : 2;
  }

  ### best and second best score are identical
  return ($bestOver >= $diff * 0.67) ? 1 : 0;
}
4264
4265
4266 ###
4267
### Decides whether the alignment pair currently buffered in @fhs (last_line_1/last_line_2) for filehandle $index belongs to the
### read pair $identifier and has a sensible orientation. Up to two further line pairs may be read from the filehandle, and the
### buffer in @fhs is updated accordingly.
### Returns 1 if the (then buffered) alignment pair may be processed for $identifier, and 0 otherwise
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### overwrites the buffered entry for this filehandle with (seq ID, line 1, line 2); 3 x undef marks the end of the bowtie output
  my $buffer_pair = sub {
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = @_;
  };

  ### only the IDs and strands are needed below, the remaining fields are extracted but not used any further
  my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
  my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
  chomp $mismatch_info_1;
  chomp $mismatch_info_2;

  (my $seq_id_1 = $id_1) =~ s/\/1$//; # removing the read /1
  (my $seq_id_2 = $id_2) =~ s/\/1$//; # removing the read /1

  ### the sequence pair stored in @fhs is already the next sequence pair to be analysed -> analyse it in a later round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### FIRST ALIGNMENT for this sequence pair: checking the orientation of the alignment. We need to discriminate between 8 different
  ### conditions, however only 4 of them are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1); ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();

  unless ($newline_1 and $newline_2){
    ### end of bowtie output; nothing is processed as the buffered alignment was in the wrong orientation
    $buffer_pair->(undef,undef,undef);
    return 0;
  }

  ### extract the IDs and strands again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;

  ### we need to capture the first read (ending on /1)
  my $seqid;
  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs and process it next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 0;
  }

  ### SECOND ALIGNMENT for the same sequence pair: checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### writing the current sequence pair to last_line_1 and last_line_2
    $buffer_pair->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again, so we need to read in yet another 2 new lines and store them in @fhs (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();

  unless ($newline_1 and $newline_2){
    ### end of bowtie output; nothing is processed as both alignments for this pair were in the wrong orientation
    $buffer_pair->(undef,undef,undef);
    return 0;
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);

  $seqid = '';
  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the next 2 lines must not have the same seq ID again. If they don't we overwrite the buffered seq ID and bowtie output with
  ### the entry we just read
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $buffer_pair->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as the alignments for this sequence pair were in the wrong orientation
}
4388
### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END

### Extracts the genomic sequences corresponding to both reads of a sequence pair from %chromosomes (2 extra bases at one of the ends each)
### and stores them in $methylation_call_params->{$sequence_identifier} together with the alignment strands, the read conversions and the
### genome conversion. If a sequence would run over the edge of a chromosome an empty string is stored instead.
### Dies if the alignment index is not 0, 1, 2 or 3
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands, which is
  ### encoded by the index. $read_pair is just a shorter handle on the parameters of this sequence pair
  my $index      = $methylation_call_params->{$sequence_identifier}->{index};
  my $read_pair  = $methylation_call_params->{$sequence_identifier};
  my $chromosome = $read_pair->{chromosome};

  ### We are giving the sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my ($alignment_read_1,$alignment_read_2);
  my ($read_conversion_info_1,$read_conversion_info_2);
  my $genome_conversion;

  ### The same sequence extracted from the genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or
  ### CHH methylation call if the C happens to be at the first or last position of the actually observed sequence
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### All alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or
  ### read 2 was the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap
  ### the extracted genomic sequences around!

  ### [Index 0, sequence originated from (converted) forward strand]
  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},$read_pair->{start_seq_1},length($read_pair->{bowtie_sequence_1})+2);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As the second conversion is GA we need to capture extra bases 3', so that they are 5' bases after reverse complementation
    if (length($chromosomes{$chromosome}) > $read_pair->{start_seq_2}+length($read_pair->{bowtie_sequence_2})+1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_2}),length($read_pair->{bowtie_sequence_2})+2);
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### [Index 1, sequence originated from complementary to (converted) reverse strand]
  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
    if ($read_pair->{start_seq_1}-1 > 0){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},$read_pair->{start_seq_1}-2,length($read_pair->{bowtie_sequence_1})+2);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so they are 3' bases after reverse complementation
    $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_2})-2,length($read_pair->{bowtie_sequence_2})+2);
    ### the reverse strand sequence needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### [Index 2, sequence originated from the complementary to (converted) forward strand]
  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'CT';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
    ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_2}),length($read_pair->{bowtie_sequence_2})+2);
    ### the reverse strand sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is CT converted so we need to capture 2 extra 3' bases
    if (length($chromosomes{$chromosome}) > ($read_pair->{start_seq_1})+length($read_pair->{bowtie_sequence_1})+1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_1}),length($read_pair->{bowtie_sequence_1})+2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### [Index 3, sequence originated from the (converted) reverse strand]
  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'GA';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
    ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' bases after reverse complementation
    if ( ($read_pair->{start_seq_2}-1) > 0){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_2})-2,length($read_pair->{bowtie_sequence_2})+2);
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is GA converted so we need to capture 2 extra 5' bases
    $non_bisulfite_sequence_2 = substr ($chromosomes{$chromosome},($read_pair->{start_seq_1})-2,length($read_pair->{bowtie_sequence_1})+2);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $read_pair->{alignment_read_1}  = $alignment_read_1;
  $read_pair->{alignment_read_2}  = $alignment_read_2;
  $read_pair->{genome_conversion} = $genome_conversion;
  $read_pair->{read_conversion_1} = $read_conversion_info_1;
  $read_pair->{read_conversion_2} = $read_conversion_info_2;
  $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $read_pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
4531
4532 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
4533
4534 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
4535 my ($sequence_identifier,$methylation_call_params) = @_;
4536 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
4537 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
4538
4539 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
4540 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
4541 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
4542 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
4543
4544 my $contains_deletion_1 = 0;
4545 my $contains_deletion_2 = 0;
4546 if ($cigar_1 =~ /D/){
4547 $contains_deletion_1 = 1;
4548 if ($verbose){ warn "$cigar_1\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_1}\n";}
4549 }
4550 if ($cigar_2 =~ /D/){
4551 $contains_deletion_2 = 1;
4552 if ($verbose){ warn "$cigar_2\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_2}\n";}
4553 }
4554
4555 # warn "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
4556 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
4557 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
4558
4559 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
4560 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
4561 my $alignment_read_1;
4562 my $alignment_read_2;
4563 my $read_conversion_info_1;
4564 my $read_conversion_info_2;
4565 my $genome_conversion;
4566
4567 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
4568 ### if the C happens to be at the last position of the actually observed sequence
4569 my $non_bisulfite_sequence_1 = '';
4570 my $non_bisulfite_sequence_2 = '';
4571 my $genomic_seq_for_MD_tag_1 = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output
4572 my $genomic_seq_for_MD_tag_2 = '';
4573
4574 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
4575 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
4576 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
4577
4578 # parsing CIGAR 1 string
4579 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
4580 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
4581 shift @ops_1; # remove the empty first element
4582 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
4583 # parsing CIGAR 2 string
4584 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
4585 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
4586 shift @ops_2; # remove the empty first element
4587 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
4588
4589 my $indels_1 = 0; # adding these to the hemming distance value (needed for the NM field in the final SAM output
4590 my $indels_2 = 0;
4591
4592 ### Extracting read 1 genomic sequence ###
4593
4594 # extracting 2 additional bp at the 5' end (read 1)
4595 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
4596 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
4597 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
4598 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
4599 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1;
4600 return;
4601 }
4602 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
4603 }
4604
4605 foreach (0..$#len_1){
4606 if ($ops_1[$_] eq 'M'){
4607 # extracting genomic sequence
4608 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
4609 if ($contains_deletion_1){
4610 $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
4611 }
4612 # warn "$non_bisulfite_sequence_1\n";
4613 # adjusting position
4614 $pos_1 += $len_1[$_];
4615 }
4616 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
4617 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls, and we can later ignore it for the generation of the MD;Z: tag
4618 $non_bisulfite_sequence_1 .= 'X' x $len_1[$_];
4619 if ($contains_deletion_1){
4620 $genomic_seq_for_MD_tag_1 .= 'X' x $len_1[$_];
4621 }
4622 # warn "$non_bisulfite_sequence_1\n";
4623 # position doesn't need adjusting
4624
4625 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist()
4626 # indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
4627 }
4628 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
4629 # we do not add any genomic sequence but only adjust the position
4630 # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later
4631 if ($contains_deletion_1){
4632 $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
4633 }
4634 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
4635 $pos_1 += $len_1[$_];
4636 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
4637 }
4638 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
4639 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
4640 }
4641 else{
4642 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
4643 }
4644 }
4645
4646 ### 3' end of read 1
4647 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
4648 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
4649 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
4650 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
4651 return;
4652 }
4653
4654 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
4655 }
4656
4657
4658 ### Extracting read 2 genomic sequence ###
4659
4660 ### 5' end of read 2
4661 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
4662 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
4663 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
4664 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
4665 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
4666 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2;
4667 return;
4668 }
4669 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
4670 }
4671
4672 foreach (0..$#len_2){
4673 if ($ops_2[$_] eq 'M'){
4674 # extracting genomic sequence
4675 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
4676 if ($contains_deletion_2){
4677 $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
4678 }
4679 # warn "$non_bisulfite_sequence_2\n";
4680 # adjusting position
4681 $pos_2 += $len_2[$_];
4682 }
4683 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
4684 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can ignore this later during the generation of the MD:Z: tag
4685 $non_bisulfite_sequence_2 .= 'X' x $len_2[$_];
4686 if ($contains_deletion_2){
4687 $genomic_seq_for_MD_tag_2 .= 'X' x $len_2[$_];
4688 }
4689 # warn "$non_bisulfite_sequence_2\n";
4690 # position doesn't need adjusting
4691
4692 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist()
4693 # $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
4694 }
4695 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
4696 # we do not add any genomic sequence but only adjust the position
4697 # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later
4698 if ($contains_deletion_2){
4699 $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
4700 }
4701 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
4702 $pos_2 += $len_2[$_];
4703 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
4704 }
4705 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
4706 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
4707 }
4708 else{
4709 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
4710 }
4711 }
4712
4713 ### 3' end of read 2
4714 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
4715 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
4716 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
4717 # need to set read 1 as well now to prevent warning
4718 # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n";
4719 # sleep(5);
4720 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
4721 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
4722 return;
4723 }
4724 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
4725 }
4726
4727 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
4728 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
4729
4730 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
4731 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
4732 ### [Index 0, sequence originated from (converted) forward strand]
4733 $counting{CT_GA_CT_count}++;
4734 $alignment_read_1 = '+';
4735 $alignment_read_2 = '-';
4736 $read_conversion_info_1 = 'CT';
4737 $read_conversion_info_2 = 'GA';
4738 $genome_conversion = 'CT';
4739 ### Read 1 is always the forward hit
4740 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
4741 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
4742 if ($contains_deletion_2){
4743 $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2);
4744 }
4745 }
4746
4747 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
4748 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
4749 ### [Index 1, sequence originated from complementary to (converted) bottom strand]
4750 $counting{GA_CT_GA_count}++;
4751 $alignment_read_1 = '+';
4752 $alignment_read_2 = '-';
4753 $read_conversion_info_1 = 'GA';
4754 $read_conversion_info_2 = 'CT';
4755 $genome_conversion = 'GA';
4756 ### Read 1 is always the forward hit
4757 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented
4758 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
4759 if ($contains_deletion_2){
4760 $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2);
4761 }
4762 }
4763
4764 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
4765 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
4766 ### [Index 2, sequence originated from the complementary to (converted) top strand]
4767 $counting{GA_CT_CT_count}++;
4768 $alignment_read_1 = '-';
4769 $alignment_read_2 = '+';
4770 $read_conversion_info_1 = 'GA';
4771 $read_conversion_info_2 = 'CT';
4772 $genome_conversion = 'CT';
4773
4774 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
4775 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
4776 if ($contains_deletion_1){
4777 $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1);
4778 }
4779 }
4780
4781 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
4782 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
4783 ### [Index 3, sequence originated from the (converted) reverse strand]
4784 $counting{CT_GA_GA_count}++;
4785 $alignment_read_1 = '-';
4786 $alignment_read_2 = '+';
4787 $read_conversion_info_1 = 'CT';
4788 $read_conversion_info_2 = 'GA';
4789 $genome_conversion = 'GA';
4790 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
4791 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
4792 if ($contains_deletion_1){
4793 $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1);
4794 }
4795 }
4796 else{
4797 die "Too many bowtie result filehandles\n";
4798 }
4799 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
4800 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
4801
4802 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
4803 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
4804 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
4805 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
4806 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
4807 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
4808 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
4809 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1;
4810 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2;
4811
4812 ## the end position of a read is stored in $pos
4813 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
4814 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
4815 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
4816 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
4817 }
4818
4819 ##########################################
4820 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
4821 ##########################################
4822
### Writes one uniquely mapped single-end read (Bowtie 1 mode) and its methylation call.
###   $identifier              - read ID, used as key into $methylation_call_params
###   $sequence                - the read sequence
###   $methylation_call_params - hashref of per-read alignment data; {position} is incremented in place
###   $quality_value           - FastQ quality string of the read
### Depending on $vanilla the read is printed to OUT in the tab-delimited format or handed to single_end_SAM_output().
sub print_bisulfite_mapping_result_single_end {
    my ($identifier, $sequence, $methylation_call_params, $quality_value) = @_;

    ### qualities are always reported in Sanger encoding (Phred 33 scale)
    $quality_value
        = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
        : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
        :            $quality_value;

    ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
    $methylation_call_params->{$identifier}->{position} += 1;

    if ($vanilla) {
        my $read = $methylation_call_params->{$identifier};

        ### column order of the tab-delimited output: read ID, strand, chromosome, start, end, read sequence,
        ### genomic sequence, methylation call, read conversion, genome conversion, quality
        my @columns = (
            $identifier,
            ( map { $read->{$_} } qw(alignment_strand chromosome position end_position) ),
            $sequence,
            ( map { $read->{$_} } qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion) ),
            $quality_value,
        );

        my $bowtie1_output = join( "\t", @columns );
        print OUT "$bowtie1_output\n";
    }
    else {
        ### SAM output, default since Bismark v1.0.0 (the sub lives at the end of the script)
        single_end_SAM_output( $identifier, $sequence, $methylation_call_params, $quality_value );
    }
}
4846
4847 ##########################################
4848 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
4849 ##########################################
4850
### Writes one mapped single-end read (Bowtie 2 mode) and its methylation call to the SAM output.
###   $identifier              - read ID, used as key into $methylation_call_params
###   $sequence                - the read sequence
###   $methylation_call_params - hashref of per-read alignment data, passed through unchanged
###   $quality_value           - FastQ quality string of the read
### Unmapped and ambiguous reads were already printed elsewhere, so only mapped reads arrive here.
sub print_bisulfite_mapping_result_single_end_bowtie2 {
    my ($identifier, $sequence, $methylation_call_params, $quality_value) = @_;

    ### qualities are always reported in Sanger encoding (Phred 33 scale)
    $quality_value
        = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
        : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
        :            $quality_value;

    ### the SAM writer lives at the end of the script
    single_end_SAM_output( $identifier, $sequence, $methylation_call_params, $quality_value );
}
4865
4866 ##########################################
4867 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
4868 ##########################################
4869
### Writes one aligned read pair (Bowtie 1 mode) and its methylation calls.
###   $identifier                         - read pair ID, used as key into $methylation_call_params
###   $sequence_1, $sequence_2            - sequences of read 1 and read 2
###   $methylation_call_params            - hashref of per-pair alignment data; {start_seq_1} is incremented in place
###   $quality_value_1, $quality_value_2  - FastQ quality strings of read 1 and read 2
### Depending on $vanilla the pair is printed to OUT in the tab-delimited format or handed to paired_end_SAM_output().
sub print_bisulfite_mapping_results_paired_ends {
    my ($identifier, $sequence_1, $sequence_2, $methylation_call_params, $quality_value_1, $quality_value_2) = @_;

    ### qualities are always reported in Sanger encoding (Phred 33 scale); the loop variable aliases
    ### the two local quality strings so both get converted in place
    if ($phred64) {
        for my $quality ( $quality_value_1, $quality_value_2 ) {
            $quality = convert_phred64_quals_to_phred33($quality);
        }
    }
    elsif ($solexa) {
        for my $quality ( $quality_value_1, $quality_value_2 ) {
            $quality = convert_solexa_quals_to_phred33($quality);
        }
    }

    ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
    ### (the end position is already 1-based)
    $methylation_call_params->{$identifier}->{start_seq_1} += 1;

    if ($vanilla) {
        my $read_pair = $methylation_call_params->{$identifier};

        ### column order of the tab-delimited output: pair ID, read 1 strand, chromosome, start, end,
        ### read 1 (sequence, genomic sequence, call), read 2 (sequence, genomic sequence, call),
        ### read 1 conversion, genome conversion, both qualities
        my @columns = (
            $identifier,
            ( map { $read_pair->{$_} } qw(alignment_read_1 chromosome start_seq_1 alignment_end) ),
            $sequence_1,
            ( map { $read_pair->{$_} } qw(unmodified_genomic_sequence_1 methylation_call_1) ),
            $sequence_2,
            ( map { $read_pair->{$_} } qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion) ),
            $quality_value_1,
            $quality_value_2,
        );

        my $bowtie1_output_paired_end = join( "\t", @columns );
        print OUT "$bowtie1_output_paired_end\n";
    }
    else {
        ### SAM output, default since Bismark v1.0.0 (the sub lives at the end of the script)
        paired_end_SAM_output( $identifier, $sequence_1, $sequence_2, $methylation_call_params, $quality_value_1, $quality_value_2 );
    }
}
4896
4897 ##########################################
4898 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
4899 ##########################################
4900
### Writes one aligned read pair (Bowtie 2 mode) and its methylation calls to the SAM output.
###   $identifier                         - read pair ID, used as key into $methylation_call_params
###   $sequence_1, $sequence_2            - sequences of read 1 and read 2
###   $methylation_call_params            - hashref of per-pair alignment data, passed through unchanged
###   $quality_value_1, $quality_value_2  - FastQ quality strings of read 1 and read 2
### Unmapped and ambiguous reads were already printed elsewhere, so only aligned pairs arrive here.
sub print_bisulfite_mapping_results_paired_ends_bowtie2 {
    my ($identifier, $sequence_1, $sequence_2, $methylation_call_params, $quality_value_1, $quality_value_2) = @_;

    ### qualities are always reported in Sanger encoding (Phred 33 scale); the loop variable aliases
    ### the two local quality strings so both get converted in place
    if ($phred64) {
        for my $quality ( $quality_value_1, $quality_value_2 ) {
            $quality = convert_phred64_quals_to_phred33($quality);
        }
    }
    elsif ($solexa) {
        for my $quality ( $quality_value_1, $quality_value_2 ) {
            $quality = convert_solexa_quals_to_phred33($quality);
        }
    }

    ### the SAM writer lives at the end of the script
    paired_end_SAM_output( $identifier, $sequence_1, $sequence_2, $methylation_call_params, $quality_value_1, $quality_value_2 );

}
4918
4919
### Re-encodes a Phred+64 quality string as a Phred+33 (Sanger) quality string.
### Takes the quality string as its only argument and returns the converted string of the same length.
### Each character is translated on its own: first to a Phred score, then to its Phred+33 character.
sub convert_phred64_quals_to_phred33 {
    my ($phred64_quality) = @_;

    my @phred33_chars = map {
        my $phred_score = convert_phred64_quality_string_into_phred_score($_);
        convert_phred_score_into_phred33_quality_string($phred_score);
    } split( //, $phred64_quality );

    return join( "", @phred33_chars );
}
4935
### Re-encodes a Solexa (pre-1.3) quality string as a Phred+33 (Sanger) quality string.
### Takes the quality string as its only argument and returns the converted string of the same length.
### Each character is translated on its own: first to a Phred score, then to its Phred+33 character.
sub convert_solexa_quals_to_phred33 {
    my ($solexa_quality) = @_;

    my @phred33_chars = map {
        my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score($_);
        convert_phred_score_into_phred33_quality_string($phred_score);
    } split( //, $solexa_quality );

    return join( "", @phred33_chars );
}
4951
### Turns a numeric Phred score into its single-character Phred+33 representation
### (the character whose code point is the score plus 33).
sub convert_phred_score_into_phred33_quality_string {
    my ($phred_score) = @_;
    my $phred33_offset = 33;
    return chr( $phred_score + $phred33_offset );
}
4957
### Turns a single Phred+64 quality character into its numeric Phred score
### (the character's code point minus 64).
sub convert_phred64_quality_string_into_phred_score {
    my ($quality_char) = @_;
    my $phred64_offset = 64;
    return ord($quality_char) - $phred64_offset;
}
4963
### Turns a single Solexa (pre-1.3) quality character into an approximate numeric Phred score.
### A fixed offset of 59 is used: all Phred scores between 10 and 40 look exactly the same,
### and there is only a minute difference for values between 0 and 10.
sub convert_solexa_pre1_3_quality_string_into_phred_score {
    my ($quality_char) = @_;
    my $solexa_offset = 59;
    return ord($quality_char) - $solexa_offset;
}
4970
4971
### Looks up the genomic sequence a single-end read (Bowtie 1 mode) was aligned to and records, for that read,
### which strand and which conversions the later methylation call has to assume.
###   $sequence_identifier     - read ID, used as key into $methylation_call_params
###   $methylation_call_params - hashref of per-read alignment data; reads {index}, {chromosome}, {position}
###                              and {bowtie_sequence}, and writes {alignment_strand}, {read_conversion},
###                              {genome_conversion}, {unmodified_genomic_sequence} and {end_position}
### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands.
### The genomic sequence is extracted with 2 extra bases so that CpG, CHG and CHH context can still be determined
### for a C at the last position of the read. If those 2 extra bases would fall off the edge of the chromosome
### the genomic sequence is stored as an empty string instead.
### Dies if the (pbat adjusted) alignment index is not one of 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end {
    my ($sequence_identifier, $methylation_call_params) = @_;

    ### One entry per alignment index:
    ###   counter            - key in %counting that is incremented for this kind of alignment
    ###   strand             - alignment strand reported for the read
    ###   read_conversion    - conversion of the read (C->T or G->A)
    ###   genome_conversion  - conversion of the genome the read aligned to
    ###   flank_upstream     - true: the 2 extra bases are taken in front of the alignment start,
    ###                        false: they are taken behind the alignment end
    ###   reverse_complement - true: the extracted sequence gets reverse complemented
    my @alignment_classes = (
        ### Index 0: CT converted read vs. CT converted genome (+ orientation alignments are reported only),
        ### sequence originated from (converted) forward strand; 2 extra bases at the 3' end
        {   counter            => 'CT_CT_count',
            strand             => '+',
            read_conversion    => 'CT',
            genome_conversion  => 'CT',
            flank_upstream     => 0,
            reverse_complement => 0,
        },
        ### Index 1: CT converted read vs. GA converted genome (- orientation alignments are reported only),
        ### sequence originated from (converted) reverse strand; 2 extra 5' bases on the forward strand
        ### become 2 extra 3' bases after reverse complementation
        {   counter            => 'CT_GA_count',
            strand             => '-',
            read_conversion    => 'CT',
            genome_conversion  => 'GA',
            flank_upstream     => 1,
            reverse_complement => 1,
        },
        ### Index 2: GA converted read vs. CT converted genome (- orientation alignments are reported only),
        ### sequence originated from complementary to (converted) forward strand; 2 extra 3' bases on the
        ### forward strand become 2 extra 5' bases after reverse complementation
        {   counter            => 'GA_CT_count',
            strand             => '-',
            read_conversion    => 'GA',
            genome_conversion  => 'CT',
            flank_upstream     => 0,
            reverse_complement => 1,
        },
        ### Index 3: GA converted read vs. GA converted genome (+ orientation alignments are reported only),
        ### sequence originated from complementary to (converted) reverse strand; 2 extra bases at the 5' end
        ### as we are nominally checking the converted reverse strand
        {   counter            => 'GA_GA_count',
            strand             => '+',
            read_conversion    => 'GA',
            genome_conversion  => 'GA',
            flank_upstream     => 1,
            reverse_complement => 0,
        },
    );

    ### in pbat mode indexes 0 and 1 are simply not run, so the stored index is shifted by 2
    my $effective_index = $methylation_call_params->{$sequence_identifier}->{index} + ( $pbat ? 2 : 0 );

    ### numeric comparison, so anything that is not numerically 0, 1, 2 or 3 is rejected
    my ($class_number) = grep { $effective_index == $_ } 0 .. $#alignment_classes;
    die "Too many bowtie result filehandles\n" unless defined $class_number;

    my $class = $alignment_classes[$class_number];
    $counting{ $class->{counter} }++;

    ### the entry for this read exists at this point (the index lookup above went through it)
    my $read        = $methylation_call_params->{$sequence_identifier};
    my $read_length = length( $read->{bowtie_sequence} );

    ### checking if the substring will be valid or if we can't extract the sequence because we are
    ### right at the edge of a chromosome
    my $window_start;
    my $window_is_valid;
    if ( $class->{flank_upstream} ) {
        $window_start    = $read->{position} - 2;
        $window_is_valid = $window_start >= 0;
    }
    else {
        $window_start    = $read->{position};
        $window_is_valid = length( $chromosomes{ $read->{chromosome} } ) > $window_start + $read_length + 1;
    }

    ### read length + 2 extra bases; stays empty if the window does not fit onto the chromosome
    my $non_bisulfite_sequence = '';
    if ($window_is_valid) {
        $non_bisulfite_sequence = substr( $chromosomes{ $read->{chromosome} }, $window_start, $read_length + 2 );
        if ( $class->{reverse_complement} ) {
            $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
        }
    }

    ### the alignment_strand information is needed to determine which strand of the genomic sequence we are
    ### comparing the read against, the read_conversion information is needed to know whether we are looking
    ### for C->T or G->A substitutions
    $read->{alignment_strand}            = $class->{strand};
    $read->{read_conversion}             = $class->{read_conversion};
    $read->{genome_conversion}           = $class->{genome_conversion};
    $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

    ### at this point we can also determine the end position of a read
    $read->{end_position} = $read->{position} + $read_length;
}
5081
5082
5083 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
5084 my ($sequence_identifier,$methylation_call_params) = @_;
5085
5086 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{MD_tag};
5087 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
5088
5089 my $contains_deletion = 0;
5090 if ($cigar =~ /D/){
5091 $contains_deletion = 1;
5092 # warn "$cigar\n$MD_tag\n";
5093 }
5094 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
5095 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
5096
5097 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
5098 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
5099 my $alignment_strand;
5100 my $read_conversion_info;
5101 my $genome_conversion;
5102
5103 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
5104 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
5105 my $non_bisulfite_sequence = '';
5106 my $genomic_seq_for_MD_tag = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output
5107
5108 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
5109 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
5110
5111 # parsing CIGAR string
5112 my @len = split (/\D+/,$cigar); # storing the length per operation
5113 my @ops = split (/\d+/,$cigar); # storing the operation
5114 shift @ops; # remove the empty first element
5115 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
5116
5117 my $pbat_index_modifier = 0;
5118
5119 if ($pbat){
5120 $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1!
5121 }
5122
5123 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
5124 if ( (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1) or (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3) ){
5125 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
5126 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
5127 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
5128 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag;
5129 return;
5130 }
5131 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
5132 }
5133
5134 my $indels = 0;
5135
5136 foreach (0..$#len){
5137 if ($ops[$_] eq 'M'){
5138 #extracting genomic sequence
5139 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
5140 if ($contains_deletion){
5141 $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
5142 }
5143 # adjusting position
5144 $pos += $len[$_];
5145 }
5146 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
5147 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can later ignore it better during the generation of the MD:Z-tag
5148 $non_bisulfite_sequence .= 'X' x $len[$_];
5149 if ($contains_deletion){
5150 $genomic_seq_for_MD_tag .= 'X' x $len[$_];
5151 }
5152 # warn "$non_bisulfite_sequence\n";
5153 # position doesn't need to be adjusting
5154
5155 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail the base by base comparison in hemming_dist()
5156 # $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
5157 }
5158 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
5159 # we do not add any genomic sequence but only adjust the position
5160
5161 # we do however add the genomic sequence to the $genomic_sequence for MD-tag determination if the CIGAR string contained a deletion
5162 if ($contains_deletion){
5163 $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
5164 }
5165 $pos += $len[$_];
5166 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
5167 }
5168 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
5169 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
5170 }
5171 else{
5172 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
5173 }
5174 }
5175
5176 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
5177 if ( ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0) or ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2) ){
5178 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
5179 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
5180 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
5181 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag;
5182 return;
5183 }
5184 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
5185 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
5186 }
5187
5188
5189 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
5190 if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){
5191 ### [Index 0, sequence originated from (converted) forward strand]
5192 $counting{CT_CT_count}++;
5193 $alignment_strand = '+';
5194 $read_conversion_info = 'CT';
5195 $genome_conversion = 'CT';
5196 }
5197
5198 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
5199 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){
5200 ### [Index 1, sequence originated from (converted) reverse strand]
5201 $counting{CT_GA_count}++;
5202 $alignment_strand = '-';
5203 $read_conversion_info = 'CT';
5204 $genome_conversion = 'GA';
5205
5206 ### reverse complement!
5207 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
5208 if ($contains_deletion){
5209 $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag);
5210 }
5211 }
5212
5213 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
5214 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){
5215 ### [Index 2, sequence originated from complementary to (converted) forward strand]
5216 $counting{GA_CT_count}++;
5217 $alignment_strand = '-';
5218 $read_conversion_info = 'GA';
5219 $genome_conversion = 'CT';
5220
5221 ### reverse complement!
5222 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
5223 if ($contains_deletion){
5224 $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag);
5225 }
5226 }
5227
5228 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
5229 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){
5230 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
5231 $counting{GA_GA_count}++;
5232 $alignment_strand = '+';
5233 $read_conversion_info = 'GA';
5234 $genome_conversion = 'GA';
5235
5236 }
5237 else{
5238 die "Too many Bowtie 2 result filehandles\n";
5239 }
5240
5241 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
5242 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
5243 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
5244 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
5245 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag;
5246
5247 # if ($contains_deletion){
5248 # warn "non-bis: $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence}\n";
5249 # warn "MD-seq: $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag}\n";
5250 # }
5251
5252 ### the end position of a read is stored in $pos
5253 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
5254 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
5255 }
5256
5257 ### METHYLATION CALL
5258
sub methylation_call{
  ### Derives the methylation call string for a single aligned read by comparing the read base by base with the
  ### corresponding genomic sequence.
  ###
  ### Arguments:
  ###   $identifier                 - read ID (not used for the call itself, handy for debugging)
  ###   $sequence_actually_observed - the read sequence as it was observed
  ###   $genomic_sequence           - the genomic sequence the read was aligned to; it is expected to be 2 bases longer than
  ###                                 the read so that the sequence context of every cytosine can be looked up
  ###   $read_conversion            - 'CT' (read base i is compared with genomic base i, context lies downstream) or
  ###                                 'GA' (read base i is compared with genomic base i+2, context lies upstream)
  ###
  ### Returns the methylation call string (one character per read base, see the legend below). As a side effect the
  ### per-context counts of this read are added to the total_* entries of the %counting hash.
  ### Dies if $read_conversion is neither 'CT' nor 'GA'.
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
  ### CpG, CHH or CHG context

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  ### U for methylated C in unknown context (was protected)     ###
  ### u for not methylated C in unknown context (was converted) ###
  #################################################################

  ### this is a numeric comparison of two array lengths; a mismatch is only reported, the call is attempted anyway
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar(@seq) == scalar(@genomic) - 2);

  ### The CT and GA calls follow exactly the same decision tree, they only differ in
  ###   - the genomic base that can carry a call        (C, or G if the C sits on the opposing strand)
  ###   - the read base that indicates a conversion     (T, or A)
  ###   - the neighbouring base that defines CpG / CHG  (G downstream, or C upstream)
  ###   - where these bases are found relative to the read position
  my ($target_base,$converted_base,$partner_base,$genomic_offset,$second_context_offset);

  if ($read_conversion eq 'CT'){
    ($target_base,$converted_base,$partner_base,$genomic_offset,$second_context_offset) = ('C','T','G',0,2);
  }
  elsif ($read_conversion eq 'GA'){
    ($target_base,$converted_base,$partner_base,$genomic_offset,$second_context_offset) = ('G','A','C',2,0);
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  ### per-read tallies, keyed by the call character that is written into the methylation call string
  my %tally = map { $_ => 0 } qw(Z z X x H h U u);
  my @match = ();

  for my $index (0..$#seq){
    my $read_base    = $seq[$index];
    my $genomic_base = $genomic[$index + $genomic_offset];
    my $call         = '.'; # anything that is not a (protected or converted) cytosine position

    ### Only positions with a C (or G for GA reads) in the genome are of interest, and only if the read shows either the
    ### unchanged base (protected by methylation) or the bisulfite converted base. All other mismatches are ignored
    if ($genomic_base eq $target_base and ($read_base eq $target_base or $read_base eq $converted_base)){

      ### the base directly next to the cytosine is at $index+1 for both CT and GA reads
      my $first_context_base = $genomic[$index + 1];
      my $context;

      if ($first_context_base eq $partner_base){
        $context = 'Z'; # CpG context
      }
      elsif ($first_context_base eq 'N'){ # we cannot really be sure about the sequence context (as it might have been a CG)
        $context = 'U';
      }
      else{
        ### C is not in CpG context, determining the second context base
        my $second_context_base = $genomic[$index + $second_context_offset];

        if ($second_context_base eq $partner_base){
          $context = 'X'; # CHG context
        }
        elsif ($second_context_base eq 'N'){ # we cannot really be sure about the sequence context (as it might have been a CHH or CHG)
          $context = 'U';
        }
        else{
          $context = 'H'; # CHH context
        }
      }

      ### upper case = protected (methylated), lower case = converted (not methylated)
      $call = ($read_base eq $target_base) ? $context : lc $context;
      ++$tally{$call};
    }

    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count}                  += $tally{H};
  $counting{total_meCHG_count}                  += $tally{X};
  $counting{total_meCpG_count}                  += $tally{Z};
  $counting{total_meC_unknown_count}            += $tally{U};
  $counting{total_unmethylated_CHH_count}       += $tally{h};
  $counting{total_unmethylated_CHG_count}       += $tally{x};
  $counting{total_unmethylated_CpG_count}       += $tally{z};
  $counting{total_unmethylated_C_unknown_count} += $tally{u};

  return $methylation_call;
}
5478
sub read_genome_into_memory{
  ### Reads all FastA files of the genome folder ($genome_folder) into memory.
  ###
  ### Argument: the directory to return to once the genome has been read.
  ###
  ### Files with the extension .fa are used; only if there are none, files ending in .fasta are used instead. Both
  ### single-entry and multi-FastA files are handled. For every entry the upper-cased sequence is stored in
  ### %chromosomes (keyed by chromosome name), and the chromosome name is stored in %SQ_order under a running number
  ### reflecting the order in which the entries were read.
  ###
  ### Dies if the genome folder or $cwd can't be entered, if no sequence files are found, if a file can't be read or
  ### if a chromosome name occurs more than once.

  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  ### Running number of the chromosomes stored so far. It is ALWAYS incremented before it is used as a key of %SQ_order,
  ### so every chromosome receives its own, strictly increasing key (1, 2, 3, ...). Previously entries of multi-FastA
  ### files were stored before incrementing while the last entry of each file was stored after incrementing, which
  ### made the first entry of a multi-FastA file overwrite the last chromosome of the preceding file.
  my $SQ_count = 0;

  foreach my $chromosome_filename (@chromosome_filenames){

    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    $first_line = '' unless (defined $first_line); # empty file; extract_chromosome_name() rejects it as not being FastA
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);
    my $sequence = ''; # initialised so that entries without any sequence have a defined length of 0

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//; # removing carriage returns if present

      if ($line =~ /^>/){

        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length($sequence)," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }

          ++$SQ_count;

          print "chr $chromosome_name (",length($sequence)," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
          $SQ_order{$SQ_count} = $chromosome_name;
        }

        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    close ($chr_in) or warn "Failed to close filehandle for sequence file $chromosome_filename: $!\n";

    ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files

    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length($sequence)," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }

      ++$SQ_count;

      print "chr $chromosome_name (",length($sequence)," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
      $SQ_order{$SQ_count} = $chromosome_name;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
5562
sub extract_chromosome_name {
  ### Returns the chromosome name of a FastA header line, i.e. the first whitespace-delimited string following the
  ### initial '>' (Bowtie seems to name the sequences the same way). Dies if the line does not start with '>'.
  my ($header_line) = @_;

  unless ($header_line =~ /^>(.*)/s){
    die "The specified chromosome ($header_line) file doesn't seem to be in FASTA format as required!\n";
  }

  ## everything behind the '>' is split at whitespace, only the first field is of interest
  my ($name) = split (/\s+/,$1);
  return $name;
}
5574
sub reverse_complement{
  ### Returns the reverse complement of a sequence. Only the upper case bases A, C, G and T are complemented, any
  ### other character (e.g. N or the padding X) is kept as it is.
  my ($forward) = @_;
  my $reversed = reverse($forward);
  $reversed =~ tr/ACGT/TGCA/;
  return $reversed;
}
5581
sub biTransformFastAFiles {
  ### Writes bisulfite converted versions of a single-end FastA file to $temp_dir.
  ###
  ### Argument: the FastA file to convert (files ending in .gz are read through zcat).
  ###
  ### A C -> T converted version is always written, a G -> A converted version only unless $directional is set. The
  ### output is gzip compressed if $gzip is set. $skip and $upto restrict the records that are written, and $prefix is
  ### prepended to the output file names.
  ###
  ### Returns the names of the C -> T and the G -> A file (without $temp_dir); the G -> A name is returned even if the
  ### file was not written.

  my $file = shift;

  ### the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### The input is opened with explicit modes (and for gzipped files without going through a shell), so that file names
  ### containing white space or shell meta-characters are read correctly
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $suffix = $gzip ? '.fa.gz' : '.fa';
  my $C_to_T_infile = "${filename}_C_to_T$suffix";
  my $G_to_A_infile = "${filename}_G_to_A$suffix";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### opens one of the output files in $temp_dir for writing, optionally through gzip
  ### NOTE: in gzip mode the file name is still interpolated into a shell command
  my $open_output = sub {
    my ($outfile) = @_;
    my $fh;
    if ($gzip){
      open ($fh,'|-',"gzip -c - > ${temp_dir}${outfile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  my $ctot = $open_output->($C_to_T_infile);

  my $gtoa;
  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_output->($G_to_A_infile);
  }

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### the return value is deliberately not checked: zcat may have been cut short by $upto
  close ($in);

  close ($ctot) or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close ($gtoa) or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
5695
sub biTransformFastAFiles_paired_end {
  ### Writes bisulfite converted versions of one file of a paired-end FastA data set to $temp_dir.
  ###
  ### Arguments: the FastA file to convert (files ending in .gz are read through zcat) and the read number (1 or 2).
  ###
  ### If $directional is set only a C -> T converted version is written for read 1 and only a G -> A converted version
  ### for read 2; otherwise both versions are written. The read number is appended to the read IDs (/1 or /2, and
  ### /1/1 or /2/2 if $bowtie2 is set). The output is never gzip compressed. $skip and $upto restrict the records that
  ### are written, and $prefix is prepended to the output file names.
  ###
  ### Returns the name(s) (without $temp_dir) of the file(s) written: one name in directional mode, otherwise the
  ### C -> T name followed by the G -> A name. Dies if the read number is neither 1 nor 2.

  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ### the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### The input is opened with explicit modes (and for gzipped files without going through a shell), so that file names
  ### containing white space or shell meta-characters are read correctly
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = "${filename}_C_to_T.fa";
  my $G_to_A_infile = "${filename}_G_to_A.fa";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### working out which of the two conversions need to be written
  my ($write_C_to_T,$write_G_to_A);
  if ($directional){
    if ($read_number == 1){
      $write_C_to_T = 1;
    }
    elsif ($read_number == 2){
      $write_G_to_A = 1;
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    $write_C_to_T = $write_G_to_A = 1;
  }

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n" if ($write_C_to_T);
  warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n" if ($write_G_to_A);

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }
  if ($write_G_to_A){
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### the tag which is appended to every read ID; it stays undefined for an invalid read number
  my $read_tag;
  if ($read_number == 1){
    $read_tag = $bowtie2 ? '/1/1' : '/1';
  }
  elsif ($read_number == 2){
    $read_tag = $bowtie2 ? '/2/2' : '/2';
  }

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    die "Read number needs to be 1 or 2, but was: $read_number\n\n" unless (defined $read_tag);

    ### s/$/.../ places the tag in front of the line break terminating the header
    $header =~ s/$/$read_tag/;

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} "$header$sequence_C_to_T";
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### the return value is deliberately not checked: zcat may have been cut short by $upto
  close ($in);

  ### The converted files are closed (and the success of writing them is verified) before their names are handed back
  if ($write_C_to_T){
    close ($ctot) or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close ($gtoa) or die "Failed to close filehandle $!\n";
  }

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
5852
5853
sub biTransformFastQFiles {
  ### Writes bisulfite converted versions of a single-end FastQ file to $temp_dir.
  ###
  ### Argument: the FastQ file to convert (files ending in .gz are read through zcat).
  ###
  ### If $pbat is set only a G -> A converted version is written. Otherwise a C -> T converted version is written and,
  ### unless $directional is set, a G -> A converted version as well. The output is gzip compressed if $gzip is set.
  ### $skip and $upto restrict the records that are written, and $prefix is prepended to the output file names.
  ###
  ### Returns the name of the G -> A file in PBAT mode, otherwise the names of the C -> T and the G -> A file (all
  ### without $temp_dir). The name of a file that was not written is returned without its conversion suffix.

  my $file = shift;

  ### the converted files are named after the input file without its directory part
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### The input is opened with explicit modes (and for gzipped files without going through a shell), so that file names
  ### containing white space or shell meta-characters are read correctly
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### working out which of the two conversions need to be written. Closing the files at the end is based on the very
  ### same flags, so that only filehandles which were actually opened are ever closed
  my $write_C_to_T = $pbat ? 0 : 1;
  my $write_G_to_A = ($pbat or not $directional) ? 1 : 0;

  my $C_to_T_infile = my $G_to_A_infile = $prefix ? "$prefix.$filename" : $filename;
  my $suffix = $gzip ? '.fastq.gz' : '.fastq';

  ### opens one of the output files in $temp_dir for writing, optionally through gzip
  ### NOTE: in gzip mode the file name is still interpolated into a shell command
  my $open_output = sub {
    my ($outfile) = @_;
    my $fh;
    if ($gzip){
      open ($fh,'|-',"gzip -c - > ${temp_dir}${outfile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  my ($ctot,$gtoa);

  if ($write_C_to_T){
    $C_to_T_infile .= "_C_to_T$suffix";
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    $ctot = $open_output->($C_to_T_infile);
  }

  if ($write_G_to_A){
    $G_to_A_infile .= "_G_to_A$suffix";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_output->($G_to_A_infile);
  }

  my $count = 0;
  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file (only carried out for the very first record)
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### the return value is deliberately not checked: zcat may have been cut short by $upto
  close ($in);

  if ($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close ($gtoa) or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }

  close ($ctot) or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    close ($gtoa) or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
6011
sub biTransformFastQFiles_paired_end {
  ### Writes bisulfite converted copies (C -> T and/or G -> A) of one mate file of a paired-end FastQ
  ### library to $temp_dir and returns the name(s) of the converted file(s) (without the $temp_dir part).
  ###
  ### $file        : FastQ input file; it is read through zcat if its name ends in .gz
  ### $read_number : 1 or 2. If $directional is set, read 1 is only C -> T converted and read 2 is only
  ###                G -> A converted; otherwise both conversions are written for the file
  ###
  ### Returns ($C_to_T_infile), ($G_to_A_infile) or ($C_to_T_infile,$G_to_A_infile) accordingly.
  ### Dies if a file can't be opened or an output file can't be closed, if the first processed record
  ### doesn't look like FastQ, or if $read_number is neither 1 nor 2.
  ### Uses the file-level settings $skip, $upto, $gzip, $prefix, $directional, $bowtie2 and $temp_dir.
  my ($file,$read_number) = @_;

  ### the converted files are named after the last path component of the input file
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. The list form of the pipe open hands $file to zcat as a single
  ### argument without involving a shell, so file names containing spaces or shell metacharacters work
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    ### 3-argument open: the file name can never be interpreted as an open mode or a command
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $suffix = $gzip ? '.fastq.gz' : '.fastq';
  my $C_to_T_infile = "${filename}_C_to_T$suffix";
  my $G_to_A_infile = "${filename}_G_to_A$suffix";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  if ($directional and $read_number != 1 and $read_number != 2){
    die "Read number needs to be 1 or 2, but was $read_number!\n\n";
  }

  ### directional libraries only need one conversion per mate, everything else needs both
  my $write_CT = (!$directional or $read_number == 1);
  my $write_GA = (!$directional or $read_number == 2);

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n" if ($write_CT);
  warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n" if ($write_GA);

  ### NOTE(review): the gzip output pipes below are still assembled as shell command strings, so output
  ### names containing spaces or shell metacharacters remain unsupported
  if ($write_CT){
    if ($gzip){
      open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
  }
  if ($write_GA){
    if ($gzip){
      open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;
  while (1){
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier2   = <IN>;
    my $quality_score = <IN>;

    ### stop at the first incomplete record. defined() is tested instead of truth so that a final line
    ### consisting of a single '0' without a line break doesn't discard the last record
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    unless ($read_number == 1 or $read_number == 2){
      die "Read number needs to be 1 or 2\n";
    }

    ### the read number is appended to the read ID (in front of the line break); it is appended twice
    ### if $bowtie2 is set
    my $mate = ($read_number == 1) ? 1 : 2;
    my $tag  = $bowtie2 ? "/$mate/$mate" : "/$mate";
    $identifier =~ s/$/$tag/;

    if ($write_CT){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_GA){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### the result is deliberately not checked: if we stopped early because of $upto a zcat child may
  ### exit abnormally when its pipe is closed, and that is not an error
  close IN;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close CTOT or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close GTOA or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  close CTOT or die "Failed to close filehandle $!\n";
  close GTOA or die "Failed to close filehandle $!\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
6186
6187
6188 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES
6189
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  ### Merges two paired-end FastQ files into gzipped, bisulfite converted files in $temp_dir that hold
  ### one tab-separated line per read pair.
  ###
  ### $file_1, $file_2 : FastQ files of read 1 and read 2; each is read through zcat if its name ends in .gz
  ###
  ### Always writes the 'CT_plus_GA' file (read 1 C -> T converted, read 2 G -> A converted). Unless
  ### $directional is set it also writes the 'GA_plus_CT' file (read 1 G -> A, read 2 C -> T).
  ### Returns ($CT_plus_GA_infile) if $directional is set, else ($CT_plus_GA_infile,$GA_plus_CT_infile);
  ### the names do not include $temp_dir.
  ### Dies if a file can't be opened or an output file can't be closed, or if the first record of either
  ### input doesn't look like FastQ.
  ### Uses the file-level settings $skip, $upto, $prefix, $directional and $temp_dir.
  my ($file_1,$file_2) = @_;

  ### the output files are named after the last path component of the read 1 file
  my $filename = $file_1;
  if ($file_1 =~ /\//){
    ($filename) = $file_1 =~ m/.*\/(.*)$/;
  }

  ### The inputs are opened with the list form of the pipe open (gzipped files) or with a 3-argument
  ### open (plain files), so that the file names are never interpreted by a shell or as an open mode

  ### gzipped version of infile 1
  if ($file_1 =~ /\.gz$/){
    open (IN_1,'-|','zcat',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  else{
    open (IN_1,'<',$file_1) or die "Couldn't read from file $file_1: $!\n";
  }
  ### gzipped version of infile 2
  if ($file_2 =~ /\.gz$/){
    open (IN_2,'-|','zcat',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }
  else{
    open (IN_2,'<',$file_2) or die "Couldn't read from file $file_2: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  ### NOTE(review): the gzip output pipes are still assembled as shell command strings, so output names
  ### containing spaces or shell metacharacters remain unsupported
  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  unless ($directional){
    ### progress messages go to STDERR like everywhere else in this sub (this one used to be printed to STDOUT)
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1    = <IN_1>;
    my $sequence_1      = <IN_1>;
    my $identifier2_1   = <IN_1>;
    my $quality_score_1 = <IN_1>;

    my $identifier_2    = <IN_2>;
    my $sequence_2      = <IN_2>;
    my $identifier2_2   = <IN_2>;
    my $quality_score_2 = <IN_2>;

    ### stop as soon as either file runs out of complete records. defined() is tested instead of truth so
    ### that a final line consisting of a single '0' without a line break doesn't discard the last pair
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier2_1 and defined $quality_score_1
                 and defined $identifier_2 and defined $sequence_2 and defined $identifier2_2 and defined $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence_1 = uc $sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc $sequence_2; # make input file 2 case insensitive

    (my $sequence_1_C_to_T = $sequence_1) =~ tr/C/T/;
    (my $sequence_2_G_to_A = $sequence_2) =~ tr/G/A/;

    print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      (my $sequence_1_G_to_A = $sequence_1) =~ tr/G/A/;
      (my $sequence_2_C_to_T = $sequence_2) =~ tr/C/T/;
      print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### the results are deliberately not checked: if we stopped early because of $upto a zcat child may
  ### exit abnormally when its pipe is closed, and that is not an error
  close IN_1;
  close IN_2;

  close CTPLUSGA or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }

  close GAPLUSCT or die "Couldn't close filehandle\n";
  warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
  return ($CT_plus_GA_infile,$GA_plus_CT_infile);
}
6327
6328
sub fix_IDs{
  ### Returns a copy of the given read ID in which every run of spaces and/or tabs has been replaced
  ### by a single underscore. All other characters (including a trailing line break) are left as they are.
  my ($read_id) = @_;
  $read_id =~ s/[ \t]+/_/g;
  return $read_id;
}
6334
sub ensure_sensical_alignment_orientation_single_end{
  ### Checks whether a single-end alignment has the strand that makes sense for the alignment instance
  ### that produced it, and counts the outcome in that instance's entry of @fhs.
  ###
  ### $index  : index into @fhs of the instance that produced the alignment
  ### $strand : strand of the alignment ('+' or '-')
  ###
  ### Returns 1 if the orientation is correct (the instance's {seen} counter is incremented) and 0 if it
  ### is the nonsensical one (the instance's {wrong_strand} counter is incremented). A strand that is
  ### neither '+' nor '-' is not counted at all and now yields an explicit 0 (the sub used to fall off
  ### its end without a return statement in that case).
  ### Dies if the name of the instance is not one of the four names listed below.
  my ($index,$strand) = @_;

  ### the only strand that makes sense for each combination of converted read and converted genome
  my %sensible_strand_for = (
    CTreadCTgenome => '+', # C -> T converted read against C -> T converted genome: (+) alignments only
    CTreadGAgenome => '-', # C -> T converted read against G -> A converted genome: (-) alignments only
    GAreadCTgenome => '-', # G -> A converted read against C -> T converted genome: (-) alignments only
    GAreadGAgenome => '+', # G -> A converted read against G -> A converted genome: (+) alignments only
  );

  my $sensible_strand = $sensible_strand_for{ $fhs[$index]->{name} };
  unless (defined $sensible_strand){
    die "One of the above conditions must be true\n";
  }

  ### correct orientation
  if ($strand eq $sensible_strand){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the opposite strand is nonsensical
  if ($strand eq '+' or $strand eq '-'){
    $fhs[$index]->{wrong_strand}++;
  }
  return 0;
}
6407
sub ensure_sensical_alignment_orientation_paired_ends{
  ### Checks whether a paired-end alignment was reported in the read order that makes sense for the
  ### alignment instance that produced it, and counts the outcome in that instance's entry of @fhs.
  ###
  ### $index              : index into @fhs of the instance that produced the alignment
  ### $id_1, $strand_1    : read ID and strand of the first reported alignment
  ### $id_2, $strand_2    : read ID and strand of the second reported alignment
  ###
  ### Only two layouts are accepted, both with the first alignment on (+) and the second one on (-):
  ###   - the first ID ends in 1 and the second ID ends in 2 (read 1 reported first)
  ###   - the first ID ends in 2 and the second ID ends in 1 (read 2 reported first)
  ### Anything else is fatal.
  ###
  ### Returns 1 for the sensible layout (the instance's {seen} counter is incremented) and 0 for the
  ### other one (the instance's {wrong_strand} counter is incremented).
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $instance = $fhs[$index];
  my $name     = $instance->{name};

  ### which of the two layouts is the sensible one depends on the instance
  my $read_1_has_to_come_first;
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome'){
    $read_1_has_to_come_first = 1; # indexes 0 and 1 in the original layout of this sub
  }
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome'){
    $read_1_has_to_come_first = 0; # indexes 2 and 3 in the original layout of this sub
  }
  else{
    die "One of the above conditions must be true\n";
  }

  my $read_1_came_first = ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-');
  my $read_2_came_first = ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-');

  unless ($read_1_came_first or $read_2_came_first){
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  ### exactly one of the two layouts holds from here on
  my $sensible = $read_1_has_to_come_first ? $read_1_came_first : $read_2_came_first;
  if ($sensible){
    $instance->{seen}++;
    return 1;
  }

  $instance->{wrong_strand}++;
  return 0;
}
6504
6505 #####################################################################################################################################################
6506
6507 ### Bowtie 1 (default) | PAIRED-END | FASTA
6508
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  ### Bowtie 1, paired-end, FastA: starts one Bowtie process for every entry of @fhs (in directional mode
  ### only for the entries that have an {inputfile_1}), reads the first two lines of its output and stores
  ### them in the entry as {last_line_1} and {last_line_2}, together with the ID of read 1 as {last_seq_id}.
  ### All three are set to undef for entries that are skipped or that produced no output.
  ###
  ### The four arguments are the names of the converted input files; they are only used for the progress
  ### message, the files that are actually aligned are taken from {inputfile_1} and {inputfile_2}.
  ### Dies if the pipe can't be opened or if neither of the first two IDs ends in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  warn $directional
    ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n"
    : "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";

  warn $directional
    ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  foreach my $fh (@fhs) {

    ### directional mode: entries without input files are not aligned
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### without a complete first pair of output lines there is nothing to store
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    chomp ($line_1,$line_2);
    my $id_1 = (split (/\t/,$line_1))[0]; # the sequence identifier is the first field of a Bowtie output line
    my $id_2 = (split (/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We will thus identify which sequence was read 1 (by removing its /1 tag) and store this ID as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
6587
6588 ### Bowtie 2 | PAIRED-END | FASTA
6589
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Bowtie 2, paired-end, FastA: starts one Bowtie 2 process for every entry of @fhs (in directional
  ### mode only for the entries that have an {inputfile_1}), skips the SAM header of its output and stores
  ### the first two alignment lines in the entry as {last_line_1} and {last_line_2}, together with the ID
  ### of read 1 as {last_seq_id}. All three are set to undef for entries that are skipped or that
  ### produced no alignment lines.
  ###
  ### The four arguments are the names of the converted input files; they are only used for the progress
  ### message, the files that are actually aligned are taken from {inputfile_1} and {inputfile_2}.
  ### Dies if the pipe to Bowtie 2 can't be opened.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first lines of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### directional mode: entries without input files are not aligned
    if ($directional){
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
    ### A lexical variable is used for this so that the global $_ of the caller is left untouched.
    ### After the loop $line_1 is either the first non-header line or undef (no alignment output)
    my $line_1;
    while (defined ($line_1 = $fh->{fh}->getline())){
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;
      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

      ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
      ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

      if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /1 tag if present
        $fh->{last_seq_id} = $id_2;
      }
      else{
        ### NOTE(review): this only warns and leaves {last_seq_id} at its previous value, whereas the other
        ### paired-end alignment subs die at this point. Confirm whether the difference is intended
        warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
6677
6678 ### Bowtie 1 (default) | PAIRED-END | FASTQ
6679
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Bowtie 1, paired-end, FastQ: starts one Bowtie process for every entry of @fhs (in directional and
  ### PBAT mode only for the entries that have an {inputfile_1}), reads the first two lines of its output
  ### and stores them in the entry as {last_line_1} and {last_line_2}, together with the ID of read 1 as
  ### {last_seq_id}. All three are set to undef for entries that are skipped or that produced no output.
  ###
  ### If $gzip is set, {inputfile_1} is piped through zcat into Bowtie's --12 option; otherwise
  ### {inputfile_1} and {inputfile_2} are passed to Bowtie with -1 and -2.
  ### The four arguments are the names of the converted input files and are only used for the progress message.
  ### Dies if the pipe can't be opened or if neither of the first two IDs ends in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message;
  if ($directional){
    $input_message = "Input file is $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  elsif ($pbat){
    $input_message = "Input file is $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ; PBAT-Seq)\n";
  }
  else{
    $input_message = "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 and $G_to_A_infile_1 and $C_to_T_infile_2 (non-directional; FastQ)\n";
  }
  warn $input_message;

  ### directional and PBAT libraries only use two of the four instances
  my $two_instances_only = ($directional or $pbat);

  warn $two_instances_only
    ? "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  foreach my $fh (@fhs) {

    ### skipping unwanted filehandles
    if ($two_instances_only and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    if ($gzip){
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
      open ($fh->{fh},"zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
      sleep(5);
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
    }

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### without a complete first pair of output lines there is nothing to store
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next;
    }

    chomp ($line_1,$line_2);
    my $id_1 = (split (/\t/,$line_1))[0]; # the sequence identifier is the first field of a Bowtie output line
    my $id_2 = (split (/\t/,$line_2))[0];

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We will thus identify which sequence was read 1 (by removing its /1 tag) and store this ID as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
6767
6768 ### Bowtie 2 | PAIRED-END | FASTQ
6769
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Bowtie 2, paired-end, FastQ: starts one Bowtie 2 process for every entry of @fhs (in directional and
  ### PBAT mode only for the entries that have an {inputfile_1}), skips the SAM header of its output and
  ### stores the first two alignment lines in the entry as {last_line_1} and {last_line_2}, together with
  ### the ID of read 1 as {last_seq_id}. All three are set to undef for entries that are skipped or that
  ### produced no alignment lines.
  ###
  ### The four arguments are the names of the converted input files; they are only used for the progress
  ### message, the files that are actually aligned are taken from {inputfile_1} and {inputfile_2}.
  ### Dies if the pipe can't be opened or if neither of the first two IDs ends in /1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  elsif ($pbat){
    warn "Input files are $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first lines of the output
  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    if ($directional or $pbat){ # skipping unwanted filehandles
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip everything until the first sequence.
    ### A lexical variable is used for this so that the global $_ of the caller is left untouched.
    ### After the loop $line_1 is either the first non-header line or undef (no alignment output)
    my $line_1;
    while (defined ($line_1 = $fh->{fh}->getline())){
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;
      ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
      ### We will thus identify which sequence was read 1 and store this ID as last_seq_id

      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

      if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }

    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
6862
6863 #####################################################################################################################################################
6864
6865 ### Bowtie 1 (default) | SINGLE-END | FASTA
### Starts one Bowtie 1 process for every entry in @fhs and primes each entry with the
### first alignment line that Bowtie reports for the converted FastA reads.
###
### Arguments: names of the C->T and of the G->A converted read file. Within this sub they
###            are only used for the log messages; the file handed to Bowtie is $fh->{inputfile}.
### Effects:   for every entry of @fhs this sets
###              $fh->{fh}           - read pipe connected to the Bowtie process
###              $fh->{last_seq_id}  - first tab-separated field of the first output line
###              $fh->{last_line}    - the first output line (chomped)
###            last_seq_id and last_line are set to undef if Bowtie produced no output.
### Dies if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### --norc for the CT read/CT genome and GA read/GA genome combinations, --nofw for the two mixed
    ### combinations, so that alignments are only reported in a sensible orientation
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

    ### gzipped input is decompressed on the fly and fed to Bowtie via STDIN ('-')
    my $command = $gzip
      ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |"
      : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |";
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    ### the line is kept in a lexical variable so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if (defined $first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # first element of the bowtie output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line}   = undef;
    }
  }
}
6919
6920 ### Bowtie 2 | SINGLE-END | FASTA
### Starts one Bowtie 2 process for every entry in @fhs and primes each entry with the
### first non-header SAM line that Bowtie 2 reports for the converted FastA reads.
###
### Arguments: names of the C->T and of the G->A converted read file. Within this sub they
###            are only used for the log messages; the file handed to Bowtie 2 is $fh->{inputfile}.
### Effects:   for every entry of @fhs this sets
###              $fh->{fh}           - read pipe connected to the Bowtie 2 process
###              $fh->{last_seq_id}  - first tab-separated field of the first record line
###              $fh->{last_line}    - the first record line (chomped)
###            last_seq_id and last_line are set to undef if the output held no record line.
### Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### --norc for the CT read/CT genome and GA read/GA genome combinations, --nofw for the two mixed
    ### combinations, so that alignments are only reported in a sensible orientation
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie 2: $!";

    ### Bowtie 2 writes SAM format, so every header line (starting with @) is skipped until the first record.
    ### The lines are read into a lexical variable so that the caller's $_ is left untouched.
    my $first_record;
    while (defined (my $line = $fh->{fh}->getline())){
      next if ($line =~ /^\@/); # SAM headers start with @
      $first_record = $line;
      last;
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if (defined $first_record) {
      chomp $first_record;
      $fh->{last_seq_id} = (split(/\t/,$first_record))[0]; # first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_line}   = $first_record;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined (the output contained nothing but header lines, or nothing at all)
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line}   = undef;
    }
  }
}
6979
6980
6981 ### Bowtie 1 (default) | SINGLE-END | FASTQ
### Starts one Bowtie 1 process for every entry in @fhs and primes each entry with the
### first alignment line that Bowtie reports for the converted FastQ reads.
###
### Arguments: names of the C->T and of the G->A converted read file. Within this sub they
###            are only used for the log messages; the file handed to Bowtie is $fh->{inputfile}.
### Effects:   for every entry of @fhs this sets
###              $fh->{fh}           - read pipe connected to the Bowtie process
###              $fh->{last_seq_id}  - first tab-separated field of the first output line
###              $fh->{last_line}    - the first output line (chomped)
###            last_seq_id and last_line are set to undef if Bowtie produced no output.
### Dies if the pipe to Bowtie cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### --norc for the CT read/CT genome and GA read/GA genome combinations, --nofw for the two mixed
    ### combinations, so that alignments are only reported in a sensible orientation
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5);

    ### The strand-restricted option string ($bt_options) announced in the message above is the one that
    ### has to be handed to Bowtie; passing the plain $bowtie_options would silently drop --norc/--nofw.
    ### gzipped input is decompressed on the fly and fed to Bowtie via STDIN ('-')
    my $command = $gzip
      ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |"
      : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |";
    open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";

    ### the line is kept in a lexical variable so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if (defined $first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # first element of the Bowtie output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line}   = undef;
    }
  }
}
7040
7041 ### Bowtie 2 | SINGLE-END | FASTQ
### Starts one Bowtie 2 process for every entry in @fhs and primes each entry with the
### first non-header SAM line that Bowtie 2 reports for the converted FastQ reads.
###
### Arguments: names of the C->T and of the G->A converted read file. Within this sub they
###            are only used for the log messages; the file handed to Bowtie 2 is $fh->{inputfile}.
### Effects:   for every entry of @fhs this sets
###              $fh->{fh}           - read pipe connected to the Bowtie 2 process
###              $fh->{last_seq_id}  - first tab-separated field of the first record line
###              $fh->{last_line}    - the first record line (chomped)
###            last_seq_id and last_line are set to undef if the output held no record line.
### Dies if the pipe to Bowtie 2 cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n\n";
  }
  elsif ($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  }

  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### --norc for the CT read/CT genome and GA read/GA genome combinations, --nofw for the two mixed
    ### combinations, so that alignments are only reported in a sensible orientation
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options . ($same_conversion ? ' --norc' : ' --nofw');

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so every header line (starting with @) is skipped until the first record.
    ### The lines are read into a lexical variable so that the caller's $_ is left untouched.
    my $first_record;
    while (defined (my $line = $fh->{fh}->getline())){
      next if ($line =~ /^\@/); # SAM headers start with @
      $first_record = $line;
      last;
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if (defined $first_record) {
      chomp $first_record;
      $fh->{last_seq_id} = (split(/\t/,$first_record))[0]; # first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_line}   = $first_record;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined (the output contained nothing but header lines, or nothing at all)
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line}   = undef;
    }
  }
}
7106
7107 ###########################################################################################################################################
7108
### Re-initialises the two file-level data structures used while processing an input file:
###   %counting - every counter listed below is set back to 0
###   @fhs      - replaced by fresh records (name, strand_identity, bisulfiteIndex, seen, wrong_strand)
###
### Argument: the input file name. It is only inspected in --directional or --pbat mode, where a
###           name containing a comma is treated as paired-end and anything else as single-end.
sub reset_counters_and_fhs{
  my $filename = shift;

  my @counter_names = (
    'total_meCHH_count',
    'total_meCHG_count',
    'total_meCpG_count',
    'total_meC_unknown_count',
    'total_unmethylated_CHH_count',
    'total_unmethylated_CHG_count',
    'total_unmethylated_CpG_count',
    'total_unmethylated_C_unknown_count',
    'sequences_count',
    'no_single_alignment_found',
    'unsuitable_sequence_count',
    'genomic_sequence_could_not_be_extracted_count',
    'unique_best_alignment_count',
    'low_complexity_alignments_overruled_count',
    'CT_CT_count',                # (CT read/CT genome, original top strand)
    'CT_GA_count',                # (CT read/GA genome, original bottom strand)
    'GA_CT_count',                # (GA read/CT genome, complementary to original top strand)
    'GA_GA_count',                # (GA read/GA genome, complementary to original bottom strand)
    'CT_GA_CT_count',             # (CT read1/GA read2/CT genome, original top strand)
    'GA_CT_GA_count',             # (GA read1/CT read2/GA genome, complementary to original bottom strand)
    'GA_CT_CT_count',             # (GA read1/CT read2/CT genome, complementary to original top strand)
    'CT_GA_GA_count',             # (CT read1/GA read2/GA genome, original bottom strand)
    'alignments_rejected_count',  # only relevant if --directional was specified
  );
  %counting = map { $_ => 0 } @counter_names;

  ### builds one fresh record; 'seen' and 'wrong_strand' always start at 0
  my $new_record = sub {
    my ($name,$strand_identity,$index_basename) = @_;
    return {
      name            => $name,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  };

  my @all_four = (
    $new_record->('CTreadCTgenome','con ori forward',      $CT_index_basename),
    $new_record->('CTreadGAgenome','con ori reverse',      $GA_index_basename),
    $new_record->('GAreadCTgenome','compl ori con forward',$CT_index_basename),
    $new_record->('GAreadGAgenome','compl ori con reverse',$GA_index_basename),
  );

  ### the file name is only looked at in --directional or --pbat mode; no comma means single-end
  my $single_end = ($directional or $pbat) ? ($filename !~ ',') : 0;

  if ($single_end and $directional){
    @fhs = @all_four[0,1];   # single-end --directional: the two CT read records only
  }
  elsif ($single_end){
    @fhs = @all_four[2,3];   # single-end --pbat: the two GA read records only
  }
  else{
    @fhs = @all_four;        # paired-end files, or neither --directional nor --pbat: all four records
  }
}
7258
7259
7260 sub process_command_line{
7261 my @bowtie_options;
7262 my $help;
7263 my $mates1;
7264 my $mates2;
7265 my $path_to_bowtie;
7266 my $fastq;
7267 my $fasta;
7268 my $skip;
7269 my $qupto;
7270 my $phred64;
7271 my $phred33;
7272 my $solexa;
7273 my $mismatches;
7274 my $seed_length;
7275 my $best;
7276 my $sequence_format;
7277 my $version;
7278 my $quiet;
7279 my $chunk;
7280 my $non_directional;
7281 my $ceiling;
7282 my $maxins;
7283 my $minins;
7284 my $unmapped;
7285 my $multi_map;
7286 my $output_dir;
7287 my $bowtie2;
7288 my $vanilla;
7289 my $sam_no_hd;
7290 my $seed_extension_fails;
7291 my $reseed_repetitive_seeds;
7292 my $most_valid_alignments;
7293 my $score_min;
7294 my $parallel;
7295 my $temp_dir;
7296 my $rdg;
7297 my $rfg;
7298 my $non_bs_mm;
7299 my $samtools_path;
7300 my $bam;
7301 my $gzip;
7302 my $pbat;
7303 my $prefix;
7304 my $old_flag;
7305 my $basename;
7306 my $sam;
7307 my $multicore;
7308
7309 my $command_line = GetOptions ('help|man' => \$help,
7310 '1=s' => \$mates1,
7311 '2=s' => \$mates2,
7312 'path_to_bowtie=s' => \$path_to_bowtie,
7313 'f|fasta' => \$fasta,
7314 'q|fastq' => \$fastq,
7315 's|skip=i' => \$skip,
7316 'u|upto=i' => \$qupto,
7317 'phred33-quals' => \$phred33,
7318 'phred64-quals|solexa1' => \$phred64,
7319 'solexa-quals' => \$solexa,
7320 'n|seedmms=i' => \$mismatches,
7321 'l|seedlen=i' => \$seed_length,
7322 'no_best' => \$best,
7323 'version' => \$version,
7324 'quiet' => \$quiet,
7325 'chunkmbs=i' => \$chunk,
7326 'non_directional' => \$non_directional,
7327 'I|minins=i' => \$minins,
7328 'X|maxins=i' => \$maxins,
7329 'e|maqerr=i' => \$ceiling,
7330 'un|unmapped' => \$unmapped,
7331 'ambiguous' => \$multi_map,
7332 'o|output_dir=s' => \$output_dir,
7333 'bowtie2' => \$bowtie2,
7334 'vanilla' => \$vanilla,
7335 'sam-no-hd' => \$sam_no_hd,
7336 'D=i' => \$seed_extension_fails,
7337 'R=i' => \$reseed_repetitive_seeds,
7338 'score_min=s' => \$score_min,
7339 'most_valid_alignments=i' => \$most_valid_alignments,
7340 'p=i' => \$parallel,
7341 'temp_dir=s' => \$temp_dir,
7342 'rdg=s' => \$rdg,
7343 'rfg=s' => \$rfg,
7344 'non_bs_mm' => \$non_bs_mm,
7345 'samtools_path=s' => \$samtools_path,
7346 'bam' => \$bam,
7347 'gzip' => \$gzip,
7348 'pbat' => \$pbat,
7349 'prefix=s' => \$prefix,
7350 'old_flag' => \$old_flag,
7351 'B|basename=s' => \$basename,
7352 'sam' => \$sam,
7353 'multicore=i' => \$multicore,
7354 );
7355
7356
7357 ### EXIT ON ERROR if there were errors with any of the supplied options
7358 unless ($command_line){
7359 die "Please respecify command line options\n";
7360 }
7361 ### HELPFILE
7362 if ($help){
7363 print_helpfile();
7364 exit;
7365 }
7366 if ($version){
7367 print << "VERSION";
7368
7369
7370 Bismark - Bisulfite Mapper and Methylation Caller.
7371
7372 Bismark Version: $bismark_version
7373 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics
7374 www.bioinformatics.babraham.ac.uk/projects/
7375
7376
7377 VERSION
7378 exit;
7379 }
7380
7381
7382 ##########################
7383 ### PROCESSING OPTIONS ###
7384 ##########################
7385
7386 unless ($bowtie2){
7387 $bowtie2 = 0;
7388 }
7389 unless ($sam_no_hd){
7390 $sam_no_hd =0;
7391 }
7392
7393 ### PATH TO BOWTIE
7394 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
7395 if ($path_to_bowtie){
7396 unless ($path_to_bowtie =~ /\/$/){
7397 $path_to_bowtie =~ s/$/\//;
7398 }
7399 if (-d $path_to_bowtie){
7400 if ($bowtie2){
7401 $path_to_bowtie = "${path_to_bowtie}bowtie2";
7402 }
7403 else{
7404 $path_to_bowtie = "${path_to_bowtie}bowtie";
7405 }
7406 }
7407 else{
7408 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
7409 }
7410 }
7411 else{
7412 if ($bowtie2){
7413 $path_to_bowtie = 'bowtie2';
7414 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
7415 else{
7416 $path_to_bowtie = 'bowtie';
7417 warn "Path to Bowtie specified as: $path_to_bowtie\n";
7418 }
7419 }
7420
7421
7422 if ($sam){
7423 warn "Output format manually set as SAM\n";
7424 }
7425 else{
7426 $bam = 1;
7427 warn "Output format is BAM (default)\n";
7428 }
7429
7430 ### OUTPUT REQUESTED AS BAM FILE (default)
7431 if ($bam){
7432 if ($vanilla){
7433 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
7434 }
7435
7436 ### PATH TO SAMTOOLS
7437 if (defined $samtools_path){
7438 # if Samtools was specified as full command
7439 if ($samtools_path =~ /samtools$/){
7440 if (-e $samtools_path){
7441 # Samtools executable found
7442 }
7443 else{
7444 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
7445 }
7446 }
7447 else{
7448 unless ($samtools_path =~ /\/$/){
7449 $samtools_path =~ s/$/\//;
7450 }
7451 $samtools_path .= 'samtools';
7452 if (-e $samtools_path){
7453 # Samtools executable found
7454 }
7455 else{
7456 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
7457 }
7458 }
7459
7460 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
7461 $bam = 1;
7462 }
7463 # Check whether Samtools is in the PATH if no path was supplied by the user
7464 else{
7465 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH
7466 $samtools_path = `which samtools`;
7467 chomp $samtools_path;
7468 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
7469 $bam = 1;
7470 }
7471 }
7472
7473 unless (defined $samtools_path){
7474 $bam = 2;
7475 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
7476 }
7477 sleep (1);
7478 }
7479
7480
7481 ####################################
7482 ### PROCESSING ARGUMENTS
7483
7484 ### GENOME FOLDER
7485 my $genome_folder = shift @ARGV; # mandatory
7486 unless ($genome_folder){
7487 warn "Genome folder was not specified!\n";
7488 print_helpfile();
7489 exit;
7490 }
7491
7492 ### checking that the genome folder, all subfolders and the required bowtie index files exist
7493 unless ($genome_folder =~/\/$/){
7494 $genome_folder =~ s/$/\//;
7495 }
7496
7497 if (chdir $genome_folder){
7498 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
7499 unless ($absolute_genome_folder =~/\/$/){
7500 $absolute_genome_folder =~ s/$/\//;
7501 }
7502 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
7503 $genome_folder = $absolute_genome_folder;
7504 }
7505 else{
7506 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
7507 }
7508
7509 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
7510 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
7511
7512 my $bt2_small_index_present = 1;
7513 my $bt2_large_index_present = 1;
7514
7515 if ($bowtie2){ ### Bowtie 2
7516
7517 ### Checking for small indexes first (ending in .bt2)
7518
7519 # checking the integrity of $CT_dir
7520 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
7521
7522 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
7523 foreach my $file(@CT_bowtie_index){
7524 unless (-f $file){
7525 warn "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n";
7526 $bt2_small_index_present = 0;
7527 }
7528 }
7529 # checking the integrity of $GA_dir
7530 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
7531 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
7532
7533 foreach my $file(@GA_bowtie_index){
7534 unless (-f $file){
7535 warn "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n";
7536 $bt2_small_index_present = 0;
7537 }
7538 }
7539
7540 ### Using the small index preferentially
7541 if ($bt2_small_index_present){
7542 $bt2_large_index_present = 0;
7543 }
7544 else{ # only checking for large indexes if the 'normal' one can't be found
7545 warn "\nCouldn't find a traditional small Bowtie 2 index for the genome specified (ending in .bt2). Now searching for a large index instead (64-bit index ending in .bt2l)...\n";
7546
7547 ### If no small indexes were found we look for large indexes (64-bit indexes, ending in .bt2l)
7548
7549 # checking the integrity of $CT_dir
7550 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
7551
7552 @CT_bowtie_index = ('BS_CT.1.bt2l','BS_CT.2.bt2l','BS_CT.3.bt2l','BS_CT.4.bt2l','BS_CT.rev.1.bt2l','BS_CT.rev.2.bt2l');
7553 foreach my $file(@CT_bowtie_index){
7554 unless (-f $file){
7555 die "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n";
7556 $bt2_large_index_present = 0; }
7557 }
7558
7559 ### checking the integrity of $GA_dir
7560 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
7561 @GA_bowtie_index = ('BS_GA.1.bt2l','BS_GA.2.bt2l','BS_GA.3.bt2l','BS_GA.4.bt2l','BS_GA.rev.1.bt2l','BS_GA.rev.2.bt2l');
7562
7563 foreach my $file(@GA_bowtie_index){
7564 unless (-f $file){
7565 die "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n";
7566 $bt2_large_index_present = 0;
7567 }
7568 }
7569
7570 if ($bt2_large_index_present){
7571 warn "64-bit large genome Bowtie 2 index found...\n";
7572 }
7573 else{
7574 die "Failed to detect either a standard (.bt2) or 64-bit (.bt2l) Bowtie 2 index for the genome specified. Please run the bismark_genome_preparation before launching Bismark\n\n";
7575 }
7576 }
7577
7578 }
7579
7580 else{ ### Bowtie 1 (default)
7581 ### checking the integrity of $CT_dir
7582 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
7583 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
7584 foreach my $file(@CT_bowtie_index){
7585 unless (-f $file){
7586 die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
7587 }
7588 }
7589 ### checking the integrity of $GA_dir
7590 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
7591 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
7592 foreach my $file(@GA_bowtie_index){
7593 unless (-f $file){
7594 die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
7595 }
7596 }
7597 }
7598
7599 my $CT_index_basename = "${CT_dir}BS_CT";
7600 my $GA_index_basename = "${GA_dir}BS_GA";
7601
7602 ### INPUT OPTIONS
7603
7604 ### SEQUENCE FILE FORMAT
7605 ### exits if both fastA and FastQ were specified
7606 if ($fasta and $fastq){
7607 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
7608 }
7609
7610 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
7611 if ($fasta){
7612 print "FastA format specified\n";
7613 $sequence_format = 'FASTA';
7614 push @bowtie_options, '-f';
7615 }
7616 elsif ($fastq){
7617 print "FastQ format specified\n";
7618 $sequence_format = 'FASTQ';
7619 push @bowtie_options, '-q';
7620 }
7621 else{
7622 $fastq = 1;
7623 print "FastQ format assumed (by default)\n";
7624 $sequence_format = 'FASTQ';
7625 push @bowtie_options, '-q';
7626 }
7627
7628 ### SKIP
7629 if ($skip){
7630 warn "Skipping the first $skip reads from the input file\n";
7631 # push @bowtie_options,"-s $skip";
7632 }
7633
7634 ### UPTO
7635 if ($qupto){
7636 warn "Processing sequences up to read no. $qupto from the input file\n";
7637 if ($bowtie2){
7638 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
7639 }
7640 else{
7641 # push @bowtie_options,"--qupto $qupto";
7642 }
7643 }
7644
7645 ### QUALITY VALUES
7646 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
7647 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
7648 }
7649 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
7650 # Phred quality values work only when -q is specified
7651 unless ($fastq){
7652 die "Phred quality values works only when -q (FASTQ) is specified\n";
7653 }
7654 if ($bowtie2){
7655 push @bowtie_options,"--phred33";
7656 }
7657 else{
7658 push @bowtie_options,"--phred33-quals";
7659 }
7660 }
7661 if ($phred64){
7662 # Phred quality values work only when -q is specified
7663 unless ($fastq){
7664 die "Phred quality values work only when -q (FASTQ) is specified\n";
7665 }
7666 if ($bowtie2){
7667 push @bowtie_options,"--phred64";
7668 }
7669 else{
7670 push @bowtie_options,"--phred64-quals";
7671 }
7672 }
7673 else{
7674 $phred64 = 0;
7675 }
7676
7677 if ($solexa){
7678 if ($bowtie2){
7679 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
7680 }
7681 # Solexa to Phred value conversion works only when -q is specified
7682 unless ($fastq){
7683 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
7684 }
7685 push @bowtie_options,"--solexa-quals";
7686 }
7687 else{
7688 $solexa = 0;
7689 }
7690
7691 ### ALIGNMENT OPTIONS
7692
7693 ### MISMATCHES
7694 if (defined $mismatches){
7695 if ($bowtie2){
7696 if ($mismatches == 0 or $mismatches == 1){
7697 push @bowtie_options,"-N $mismatches";
7698 }
7699 else{
7700 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
7701 }
7702 }
7703 else{
7704 if ($mismatches >= 0 and $mismatches <= 3){
7705 push @bowtie_options,"-n $mismatches";
7706 }
7707 else{
7708 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
7709 }
7710 }
7711 }
7712 else{
7713 unless ($bowtie2){
7714 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
7715 }
7716 }
7717
7718 ### SEED LENGTH
7719 if (defined $seed_length){
7720 if ($bowtie2){
7721 push @bowtie_options,"-L $seed_length";
7722 }
7723 else{
7724 push @bowtie_options,"-l $seed_length";
7725 }
7726 }
7727
7728 ### MISMATCH CEILING
7729 if (defined $ceiling){
7730 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
7731 push @bowtie_options,"-e $ceiling";
7732 }
7733
7734
7735 ### BOWTIE 2 EFFORT OPTIONS
7736
7737 ### CONSECUTIVE SEED EXTENSION FAILS
7738 if (defined $seed_extension_fails){
7739 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
7740 push @bowtie_options,"-D $seed_extension_fails";
7741 }
7742
7743 ### RE-SEEDING REPETITIVE SEEDS
7744 if (defined $reseed_repetitive_seeds){
7745 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
7746 push @bowtie_options,"-R $reseed_repetitive_seeds";
7747 }
7748
7749
7750 ### BOWTIE 2 SCORING OPTIONS
7751
7752 my ($score_min_intercept, $score_min_slope);
7753
7754 if ($score_min){
7755 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
7756
7757 unless ($score_min =~ /^L,(.+),(.+)$/){
7758 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
7759 }
7760 ($score_min_intercept, $score_min_slope) = ($1, $2);
7761 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2
7762 }
7763 else{
7764 if ($bowtie2){
7765 ($score_min_intercept, $score_min_slope) = (0, -0.2);
7766 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2
7767 }
7768 }
7769
7770 ### BOWTIE 2 READ GAP OPTIONS
7771 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);
7772
7773 if ($rdg){
7774 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
7775 if ($rdg =~ /^(\d+),(\d+)$/){
7776 $deletion_open = $1;
7777 $deletion_extend = $2;
7778 }
7779 else{
7780 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
7781 }
7782 push @bowtie_options,"--rdg $rdg";
7783 }
7784 else{
7785 $deletion_open = 5;
7786 $deletion_extend = 3;
7787 }
7788
7789 ### BOWTIE 2 REFERENCE GAP OPTIONS
7790 if ($rfg){
7791 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
7792 if ($rfg =~ /^(\d+),(\d+)$/){
7793 $insertion_open = $1;
7794 $insertion_extend = $2;
7795 }
7796 else{
7797 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
7798 }
7799 push @bowtie_options,"--rfg $rfg";
7800 }
7801 else{
7802 $insertion_open = 5;
7803 $insertion_extend = 3;
7804 }
7805
7806
7807 ### BOWTIE 2 PARALLELIZATION OPTIONS
7808 if (defined $parallel){
7809 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
7810 }
7811 if ($bowtie2){
7812 if ($parallel){
7813 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
7814 if ($parallel > 4){
7815 warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2);
7816 }
7817 push @bowtie_options,"-p $parallel";
7818 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
7819 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
7820 sleep (2);
7821 }
7822 }
7823
7824 ### REPORTING OPTIONS
7825
7826 if ($bowtie2){
7827 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
7828
7829 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
7830 if(defined $most_valid_alignments){
7831
7832 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
7833 }
7834 }
7835 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
7836 push @bowtie_options,'-k 2';
7837 }
7838
7839 ### --BEST
7840 if ($bowtie2){
7841 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
7842 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
7843 }
7844 }
7845 else{
7846 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
7847 unless ($best){
7848 push @bowtie_options,'--best';
7849 }
7850 }
7851
7852 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
7853 if ($vanilla){
7854 if ($bowtie2){
7855 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
7856 }
7857 }
7858 else{
7859 $vanilla = 0;
7860 }
7861
7862 ### PAIRED-END MAPPING
7863 if ($mates1){
7864 my @mates1 = (split (/,/,$mates1));
7865 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
7866 my @mates2 = (split(/,/,$mates2));
7867 unless (scalar @mates1 == scalar @mates2){
7868 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
7869 }
7870 while (1){
7871 my $mate1 = shift @mates1;
7872 my $mate2 = shift @mates2;
7873 last unless ($mate1 and $mate2);
7874 push @filenames,"$mate1,$mate2";
7875 }
7876 if ($bowtie2){
7877 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
7878 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
7879 }
7880
7881 if ($old_flag){
7882 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
7883 sleep(3);
7884 }
7885 }
7886 elsif ($mates2){
7887 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
7888 }
7889
7890 ### SINGLE-END MAPPING
7891 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
7892 my $singles;
7893 unless ($mates1 and $mates2){
7894 $singles = join (',',@ARGV);
7895 unless ($singles){
7896 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
7897 }
7898 $singles =~ s/\s/,/g;
7899 @filenames = (split(/,/,$singles));
7900 warn "\nFiles to be analysed:\n";
7901 warn "@filenames\n\n";
7902 sleep (3);
7903 }
7904
7905 ### MININUM INSERT SIZE (PAIRED-END ONLY)
7906 if (defined $minins){
7907 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
7908 push @bowtie_options,"--minins $minins";
7909 }
7910
7911 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
7912 if (defined $maxins){
7913 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
7914 push @bowtie_options,"--maxins $maxins";
7915 }
7916 else{
7917 unless ($singles){
7918 push @bowtie_options,'--maxins 500';
7919 }
7920 }
7921
7922 ### QUIET prints nothing besides alignments (suppresses warnings)
7923 if ($quiet){
7924 push @bowtie_options,'--quiet';
7925 }
7926
7927 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
7928 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
7929 if (defined $chunk){
7930 push @bowtie_options,"--chunkmbs $chunk";
7931 }
7932 else{
7933 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
7934 }
7935 }
7936
7937
7938 ### SUMMARY OF ALL BOWTIE OPTIONS
7939 my $bowtie_options = join (' ',@bowtie_options);
7940
7941
7942 ### STRAND-SPECIFIC LIBRARIES
7943 my $directional;
7944 if ($non_directional){
7945 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
7946 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
7947 sleep (1);
7948 $directional = 0;
7949 }
7950 elsif($pbat){
7951 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
7952 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);
7953
7954 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
7955 sleep (1);
7956 $directional = 0;
7957 }
7958 else{
7959 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
7960 sleep (1);
7961 $directional = 1; # default behaviour
7962 }
7963
7964 ### UNMAPPED SEQUENCE OUTPUT
7965 $unmapped = 0 unless ($unmapped);
7966
7967 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
7968 $multi_map = 0 unless ($multi_map);
7969
7970
7971 ### OUTPUT DIRECTORY
7972
7973 chdir $parent_dir or die "Failed to move back to current working directory\n";
7974 if ($output_dir){
7975 unless ($output_dir =~ /\/$/){
7976 $output_dir =~ s/$/\//;
7977 }
7978
7979 if (chdir $output_dir){
7980 $output_dir = getcwd; # making the path absolute
7981 unless ($output_dir =~ /\/$/){
7982 $output_dir =~ s/$/\//;
7983 }
7984 }
7985 else{
7986 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
7987 warn "Created output directory $output_dir!\n\n";
7988 chdir $output_dir or die "Failed to move to $output_dir\n";
7989 $output_dir = getcwd; # making the path absolute
7990 unless ($output_dir =~ /\/$/){
7991 $output_dir =~ s/$/\//;
7992 }
7993 }
7994 warn "Output will be written into the directory: $output_dir\n";
7995 }
7996 else{
7997 $output_dir = '';
7998 }
7999
8000 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
8001
8002 chdir $parent_dir or die "Failed to move back to current working directory\n";
8003 if ($temp_dir){
8004 warn "\nUsing temp directory: $temp_dir\n";
8005 unless ($temp_dir =~ /\/$/){
8006 $temp_dir =~ s/$/\//;
8007 }
8008
8009 if (chdir $temp_dir){
8010 $temp_dir = getcwd; # making the path absolute
8011 unless ($temp_dir =~ /\/$/){
8012 $temp_dir =~ s/$/\//;
8013 }
8014 }
8015 else{
8016 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
8017 warn "Created temporary directory $temp_dir!\n\n";
8018 chdir $temp_dir or die "Failed to move to $temp_dir\n";
8019 $temp_dir = getcwd; # making the path absolute
8020 unless ($temp_dir =~ /\/$/){
8021 $temp_dir =~ s/$/\//;
8022 }
8023 }
8024 warn "Temporary files will be written into the directory: $temp_dir\n";
8025 }
8026 else{
8027 $temp_dir = '';
8028 }
8029
8030 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
8031 if ($non_bs_mm){
8032 if ($vanilla){
8033 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
8034 }
8035 }
8036
8037 ### PREFIX FOR OUTPUT FILES
8038 if ($prefix){
8039 # removing trailing dots
8040
8041 $prefix =~ s/\.+$//;
8042
8043 warn "Using the following prefix for output files: $prefix\n\n";
8044 sleep(1);
8045 }
8046
8047 if (defined $multicore){
8048 unless ($multicore > 0){
8049 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n";
8050 }
8051 if ($multicore > 20){
8052 warn "Core usage currently set to more than 20 threads. This might fail horribly but let's see how it goes... (set value: $multicore)\n\n";
8053 }
8054 if ($sam){
8055 die "The multicore function currently requires the output to be in BAM format, so please lose either option --sam or --multi\n";
8056 }
8057 }
8058 else{
8059 $multicore = 1; # default. Single-thread mode
8060 warn "Setting parallelization to single-threaded (default)\n\n";
8061 }
8062
8063 if ($basename and $multicore > 1){
8064 die "Specifying --basename in conjuction with --multicore is currently not supported (but we are aiming to fix this soon). Please lose either --basename or --multicore to proceed\n\n";
8065 }
8066
8067 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index_present,$multicore);
8068 }
8069
8070
8071
### Writes the SAM header section to the OUT filehandle:
###   - one @HD line (format version 1.0, sort order 'unsorted'),
###   - one @SQ line per reference sequence, emitted in ascending numerical order of the keys of
###     %SQ_order (and not in the arbitrary order a plain walk over the %chromosomes hash would give),
###   - one @PG line recording the Bismark version and the command line that was used.
### Takes no arguments; reads the file-level %SQ_order, %chromosomes, $bismark_version and $command_line.
sub generate_SAM_header{

  # @HD = header, VN = version, SO = sort order
  print OUT join("\t", '@HD', 'VN:1.0', 'SO:unsorted') . "\n";

  # @SQ = sequence, SN = seq name, LN = length
  my @ranks = sort { $a <=> $b } keys %SQ_order;
  for my $rank (@ranks){
    my $seq_name   = $SQ_order{$rank};
    my $seq_length = length ($chromosomes{$seq_name});
    print OUT join("\t", '@SQ', "SN:$seq_name", "LN:$seq_length") . "\n";
  }

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  print OUT join("\t", '@PG', 'ID:Bismark', "VN:$bismark_version", "CL:\"bismark $command_line\"") . "\n";

}
8092
8093 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
8094 ### O. Tam (2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011), A. Dei Rossi (2014)
8095
### Prints one SAM record for a single-end alignment to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier; also used as the key into $methylation_call_params
###   $actual_seq              - sequence of the read
###   $methylation_call_params - hash reference; the entry for $id supplies strand, chromosome, position,
###                              genomic sequence, methylation call and conversion details (see below)
###   $qual                    - quality string of the read
###
### Reads the file-level settings $bowtie2, $non_bs_mm, $insertion_open, $insertion_extend, $deletion_open
### and $deletion_extend, and calls the helpers revcomp(), hemming_dist() and make_mismatch_string().
###
### Side effect: for '-' strand alignments whose CIGAR string contains a deletion, the stored
### {genomic_seq_for_MD_tag} entry is replaced by its reverse complement.
###
### Dies on an unexpected strand / conversion combination and (for --non_bs_mm with Bowtie 2) on CIGAR
### strings that contain operations other than M, I and D.
sub single_end_SAM_output{

  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
  my $strand            = $methylation_call_params->{$id}->{alignment_strand};
  my $chr               = $methylation_call_params->{$id}->{chromosome};
  my $start             = $methylation_call_params->{$id}->{position};
  my $ref_seq           = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}->{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}->{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};

  # For Bowtie 2 this initially holds the alignment score; it is only converted into a mismatch count
  # further down if --non_bs_mm was requested. For Bowtie 1 it is the stored number of mismatches.
  my $number_of_mismatches = $bowtie2
    ? $methylation_call_params->{$id}->{alignment_score}
    : $methylation_call_params->{$id}->{number_of_mismatches};

  ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
  ## FLAG: bitwise FLAG. Each bit is explained in the following table:
  ## Bit    Description                                                Comment                                Value
  ## 0x1    template has multiple segments in sequencing               0: single-end 1: paired end            value: 2**0 (  1)
  ## 0x2    each segment properly aligned according to the aligner     true only for paired-end alignments    value: 2**1 (  2)
  ## 0x4    segment unmapped                                           ---                                           ---
  ## 0x8    next segment in the template unmapped                      ---                                           ---
  ## 0x10   SEQ being reverse complemented                                                                    value: 2**4 ( 16)
  ## 0x20   SEQ of the next segment in the template being reversed                                            value: 2**5 ( 32)
  ## 0x40   the first segment in the template                          read 1                                 value: 2**6 ( 64)
  ## 0x80   the last segment in the template                           read 2                                 value: 2**7 (128)
  ## 0x100  secondary alignment                                        ---                                           ---
  ## 0x200  not passing quality controls                               ---                                           ---
  ## 0x400  PCR or optical duplicate                                   ---                                           ---

  #####

  my $flag; # FLAG variable used for SAM format.
  if ($strand eq "+"){
    if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
      $flag = 0;  # 0 for "+" strand (OT)
    }
    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
      $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand)
    }
    else{
      die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
    }
  }
  elsif ($strand eq "-"){
    if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
      $flag = 16; # 16 for "-" strand (OB)
    }
    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
      $flag = 0;  # 0 for "+" strand (CTOT, yields information for the original top strand)
    }
    else{
      die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
    }
  }
  else{
    die "Unexpected strand information: $strand\n\n";
  }

  #####

  # Mapping quality is unavailable for use with Bowtie (1), in which case 255 is written
  my $mapq = $bowtie2 ? $methylation_call_params->{$id}->{mapq} : 255;

  #####

  # Bowtie 2: actual CIGAR string reported by the aligner
  # Bowtie 1: output does not contain indels (only matches and mismatches)
  my $cigar = $bowtie2 ? $methylation_call_params->{$id}->{CIGAR} : length($actual_seq) . "M";

  #####

  # Paired-end fields, constant for single-end output
  my $rnext = "*";
  my $pnext = 0;
  my $tlen  = 0;

  #####

  if ($read_conversion eq 'CT'){
    $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
  }
  else{
    $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
  }

  if ($strand eq '-'){
    $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
    $ref_seq    = revcomp($ref_seq);    # Required for comparison with actual sequence
    if ($cigar =~ /D/){
      $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} );
    }
    $qual = scalar reverse $qual;       # if the sequence was reverse-complemented the quality string needs to be reversed as well
  }

  #####

  # Optional tag NM: edit distance to the reference, based on nucleotide differences
  my $hemming_dist = hemming_dist($actual_seq,$ref_seq);
  if ($bowtie2){
    $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which were parsed while getting the genomic sequence
  }
  my $NM_tag = "NM:i:$hemming_dist";

  #####

  # Optional tag MD: string providing mismatched reference bases in the alignment (this does include indel information)
  my $MD_tag = make_mismatch_string($actual_seq, $ref_seq,$cigar,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag});

  #####

  # Optional tag XM: Methylation Call String. $strand can only be '+' or '-' at this point (anything else died above);
  # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
  my $XM_tag = ($strand eq '+') ? "XM:Z:$methcall" : 'XM:Z:' . scalar reverse($methcall);

  my $XR_tag = "XR:Z:$read_conversion";   # Optional tag XR: Read Conversion
  my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion

  #####

  # Optionally calculating number of mismatches for Bowtie 2 alignments
  if ($non_bs_mm and $bowtie2) {

    $number_of_mismatches =~ s/-//; # removing the minus sign

    ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches
    if ($cigar =~ /[DI]/) {

      # parsing CIGAR string
      my @len = split (/\D+/,$cigar); # storing the length per operation
      my @ops = split (/\d+/,$cigar); # storing the operation
      shift @ops;                     # remove the empty first element
      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

      foreach my $index (0..$#len) {
        my $op = $ops[$index];
        next if ($op eq 'M'); # irrelevant

        if ($op eq 'I') {     # insertion in the read sequence
          $number_of_mismatches -= $insertion_open;
          $number_of_mismatches -= $len[$index] * $insertion_extend;
        }
        elsif ($op eq 'D') {  # deletion in the read sequence
          $number_of_mismatches -= $deletion_open;
          $number_of_mismatches -= $len[$index] * $deletion_extend;
        }
        # tr/// takes a plain character list (not a regex character class), so no surrounding brackets here
        elsif ($cigar =~ tr/NSHPX=//) { # if these (for standard mapping) illegal characters exist we die
          die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else {
          die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
      }
    }
    ### Now we have InDel corrected alignment scores

    ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
    ### sequence contained more than 5 Ns, but this should occur close to never

    my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division
    $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
  }

  #####

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @sam_fields = ($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag);

  ### optionally print number of non-bisulfite mismatches (tag XA); the tag is only built when it is actually written out
  if ($non_bs_mm){
    push @sam_fields, "XA:Z:$number_of_mismatches";
  }

  print OUT join("\t",@sam_fields),"\n";
}
8316
8317 sub paired_end_SAM_output{
8318
8319 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
8320 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
8321 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
8322 my $chr = $methylation_call_params->{$id}->{chromosome};
8323 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
8324 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
8325 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
8326 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
8327 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
8328 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
8329 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
8330
8331 my $id_1;
8332 my $id_2;
8333
8334 if ($old_flag){
8335 $id_1 = $id.'/1';
8336 $id_2 = $id.'/2';
8337 }
8338 else{
8339 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard
8340 $id_2 = $id;
8341 }
8342
8343 # Allows all degenerate nucleotide sequences in reference genome
8344 # die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHVX]/i; # X are padded nucleotides in case of insertions in the read
8345 # die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHVX]/i;
8346
8347 my $index; # used to store the srand origin of the alignment in a less convoluted way
8348
8349 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
8350 $index = 0; ## this is OT (original top strand)
8351 }
8352 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
8353 $index = 1; ## this is CTOB (complementary to OB)
8354 }
8355 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
8356 $index = 2; ## this is CTOT (complementary to OT)
8357 }
8358 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
8359 $index = 3; ## this is OB (original bottom)
8360 }
8361 else {
8362 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
8363 }
8364
8365 my $number_of_mismatches_1;
8366 my $number_of_mismatches_2;
8367
8368 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
8369 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default!
8370 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2};
8371 }
8372 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation
8373 if ($index == 2 or $index == 3){ # CTOT or OB
8374 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default!
8375 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1};
8376 }
8377 else{ # if the first read aligned in forward direction it is like for Bowtie 2
8378 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
8379 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
8380 }
8381 }
8382
8383
8384
8385 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
8386 ### first or last position.
8387
8388 if ($index == 0 or $index == 3){ # OT or OB
8389 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
8390 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
8391 }
8392 else{ # CTOT or CTOB
8393 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
8394 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
8395 }
8396
8397 #####
8398
8399 my $start_read_1;
8400 my $start_read_2;
8401 # adjusting end positions
8402
8403 if ($bowtie2){
8404 $start_read_1 = $methylation_call_params->{$id}->{position_1};
8405 $start_read_2 = $methylation_call_params->{$id}->{position_2};
8406 }
8407 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
8408 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
8409 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
8410 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
8411 }
8412 else{ # read 1 is on the - strand
8413 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
8414 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
8415 }
8416 }
8417
8418 #####
8419
8420 my $end_read_1;
8421 my $end_read_2;
8422 # adjusting end positions
8423
8424 if ($bowtie2){
8425 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
8426 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
8427 }
8428 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
8429 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
8430 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
8431 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
8432 }
8433 else{
8434 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
8435 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
8436 }
8437 }
8438
8439 #####
8440
8441 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
8442 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
8443 ## Bit Description Comment Value
8444 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
8445 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
8446 ## 0x4 segment unmapped --- ---
8447 ## 0x8 next segment in the template unmapped --- ---
8448 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
8449 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
8450 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
8451 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
8452 ## 0x100 secondary alignment --- ---
8453 ## 0x200 not passing quality controls --- ---
8454 ## 0x400 PCR or optical duplicate --- ---
8455
8456 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
8457
8458 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
8459 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
8460
8461 my $flag_1; # FLAG variable used for SAM format
8462 my $flag_2;
8463
8464 ### The new default FLAG values have been suggested by Peter Hickey, Australia
8465
8466 if ($index == 0){ # OT
8467 unless ($old_flag){
8468 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
8469 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
8470 }
8471 else{
8472 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
8473 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
8474 }
8475 }
8476 elsif ($index == 1){ # CTOB
8477 unless($old_flag){
8478 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
8479 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
8480 }
8481 else{
8482 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
8483 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
8484 }
8485 }
8486 elsif ($index == 2){ # CTOT
8487 unless ($old_flag){
8488 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64)
8489 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128)
8490 }
8491 else{
8492 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
8493 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
8494 }
8495 }
8496 elsif ($index == 3){ # OB
8497 unless ($old_flag){
8498 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64)
8499 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128)
8500 }
8501 else{
8502 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
8503 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
8504 }
8505 }
8506
8507 #####
8508
8509 my $mapq;
8510
8511 if ($bowtie2){
8512 $mapq = $methylation_call_params->{$id}->{mapq};
8513 }
8514 else{
8515 $mapq = 255; # Mapping quality is unavailable for use with Bowtie
8516 }
8517
8518 #####
8519
8520 my $cigar_1;
8521 my $cigar_2;
8522
8523 if ($bowtie2){
8524 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
8525 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
8526 }
8527 else{
8528 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
8529 $cigar_2 = length($actual_seq_2) . "M";
8530 }
8531
8532 #####
8533
8534 my $rnext = '='; # Chromosome of mate; applies to both reads
8535
8536 #####
8537
8538 my $pnext_1 = $start_read_2; # Leftmost position of mate
8539 my $pnext_2 = $start_read_1;
8540
8541 #####
8542
8543 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
8544 my $tlen_2;
8545
8546 if ($bowtie2){
8547
8548 if ($start_read_1 <= $start_read_2){
8549
8550 # Read 1 alignment is leftmost
8551
8552 if ($end_read_2 >= $end_read_1){
8553
8554 # -------------------------> read 1 reads overlapping
8555 # <------------------------- read 2
8556 #
8557 # or
8558 #
8559 # -------------------------> read 1
8560 # <----------------------- read 2 read 2 contained within read 1
8561 #
8562 # or
8563 #
8564 # -------------------------> read 1 reads 1 and 2 exactly overlapping
8565 # <------------------------- read 2
8566 #
8567
8568 # dovetailing of reads is not enabled for Bowtie 2 alignments
8569
8570 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
8571 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
8572 }
8573 elsif ($end_read_2 < $end_read_1){
8574
8575 # -------------------------> read 1
8576 # <----------- read 2 read 2 contained within read 1
8577 #
8578 # or
8579 #
8580 # -------------------------> read 1
8581 # <------------------------ read 2 read 2 contained within read 1
8582
8583 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable
8584 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign,
8585 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this
8586 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
8587 }
8588
8589 }
8590
8591 elsif ($start_read_2 < $start_read_1){
8592
8593 if ($end_read_1 >= $end_read_2){
8594
8595 # Read 2 alignment is leftmost
8596
8597 # -------------------------> read 2 reads overlapping
8598 # <------------------------- read 1
8599 #
8600 # or
8601 #
8602 # -------------------------> read 2
8603 # <----------------------- read 1 read 1 contained within read 2
8604 #
8605 #
8606
8607 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
8608 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
8609 }
8610 elsif ($end_read_1 < $end_read_2){
8611
8612 # -------------------------> read 2
8613 # <----------- read 1 read 1 contained within read 2
8614 #
8615 # or
8616 #
8617 # -------------------------> read 2
8618 # <------------------------ read 1 read 1 contained within read 2
8619
8620 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable
8621 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign,
8622 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this
8623 ### as a request by frozenlyse on SeqAnswers on 24 July 2013
8624 }
8625 }
8626 }
8627
8628 else{ # Bowtie 1
8629
8630 if ($end_read_2 >= $end_read_1){
8631 # Read 1 alignment is leftmost
8632 # -------------------------> read 1
8633 # <------------------------- read 2
8634 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
8635
8636 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
8637 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
8638 }
8639 else{
8640 # Read 2 alignment is leftmost
8641 # -------------------------> read 2
8642 # <------------------------- read 1
8643 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
8644
8645 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
8646 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
8647 }
8648 }
8649
8650 #####
8651
8652 # adjusting the strand of the sequence before we use them to generate mismatch strings
8653 if ($strand_1 eq '-'){
8654 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
8655 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
8656 if ($cigar_1 =~ /D/){
8657 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} );
8658 }
8659 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
8660 }
8661 if ($strand_2 eq '-'){
8662 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
8663 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
8664 if ($cigar_2 =~ /D/){
8665 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} );
8666 }
8667 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
8668 }
8669
8670 # print "$actual_seq_1\n$ref_seq_1\n\n";
8671 # print "$actual_seq_2\n$ref_seq_2\n\n";
8672
8673 #####
8674
8675 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
8676 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
8677 if ($bowtie2){
8678 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
8679 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
8680 }
8681 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
8682 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
8683
8684 #####
8685
8686 my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information)
8687 my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2});
8688
8689 # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
8690 # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
8691
8692 #####
8693
8694 my $XM_tag_1; # Optional tag XM: Methylation call string
8695 my $XM_tag_2;
8696
8697 if ($strand_1 eq '-'){
8698 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
8699 }
8700 else{
8701 $XM_tag_1 = "XM:Z:$methcall_1";
8702 }
8703
8704 if ($strand_2 eq '-'){
8705 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
8706 }
8707 else{
8708 $XM_tag_2 = "XM:Z:$methcall_2";
8709 }
8710
8711 #####
8712
8713 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
8714 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
8715
8716 #####
8717
8718 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
8719
8720 #####
8721
8722 # Optionally calculating number of mismatches for Bowtie 2 alignments
8723
8724 if ($non_bs_mm) {
8725 if ($bowtie2) {
8726
8727 $number_of_mismatches_1 =~ s/-//; # removing the minus sign
8728 $number_of_mismatches_2 =~ s/-//;
8729
8730 ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches
8731
8732 ### CIGAR 1
8733 if ($cigar_1 =~ /(D|I)/) {
8734 # warn "$cigar_1\n";
8735
8736 # parsing CIGAR string
8737 my @len = split (/\D+/,$cigar_1); # storing the length per operation
8738 my @ops = split (/\d+/,$cigar_1); # storing the operation
8739 shift @ops; # remove the empty first element
8740 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
8741
8742 foreach (0..$#len) {
8743 if ($ops[$_] eq 'M') {
8744 # warn "skipping\n";
8745 next; # irrelevant
8746 }
8747 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
8748 $number_of_mismatches_1 -= $insertion_open;
8749 $number_of_mismatches_1 -= $len[$_] * $insertion_extend;
8750 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
8751 }
8752 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
8753 $number_of_mismatches_1 -= $deletion_open;
8754 $number_of_mismatches_1 -= $len[$_] * $deletion_extend;
8755 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
8756 }
8757 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
8758 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
8759 }
8760 else {
8761 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
8762 }
8763 }
8764
8765 # warn "Alignment score $number_of_mismatches_1\n";
8766 # print "Mismatches $number_of_mismatches_1\n\n";
8767 }
8768
8769 ### CIGAR 2
8770 if ($cigar_2 =~ /(D|I)/) {
8771 # warn "$cigar_2\n";
8772
8773 # parsing CIGAR string
8774 my @len = split (/\D+/,$cigar_2); # storing the length per operation
8775 my @ops = split (/\d+/,$cigar_2); # storing the operation
8776 shift @ops; # remove the empty first element
8777 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
8778
8779 foreach (0..$#len) {
8780 if ($ops[$_] eq 'M') {
8781 # warn "skipping\n";
8782 next; #irrelevant
8783 }
8784 elsif ($ops[$_] eq 'I') { # insertion in the read sequence
8785 $number_of_mismatches_2 -= $insertion_open;
8786 $number_of_mismatches_2 -= $len[$_] * $insertion_extend;
8787 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n";
8788 }
8789 elsif ($ops[$_] eq 'D') { # deletion in the read sequence
8790 $number_of_mismatches_2 -= $deletion_open;
8791 $number_of_mismatches_2 -= $len[$_] * $deletion_extend;
8792 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n";
8793 }
8794 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die
8795 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
8796 }
8797 else {
8798 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
8799 }
8800 }
8801 }
8802
8803 ### Now we have InDel corrected Alignment scores
8804
8805 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
8806 ### sequence contained more than 5 Ns, but this should occur close to never
8807
8808 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division
8809 my $seq_2_N_count = $number_of_mismatches_2 % 6;
8810 # warn "N count 1: $seq_1_N_count\n";
8811 # warn "N count 2: $seq_2_N_count\n";
8812
8813 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count;
8814 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count;
8815
8816 # warn "MM1 $number_of_mismatches_1 \n";
8817 # warn "MM2 $number_of_mismatches_2 \n";
8818 }
8819 }
8820
8821 ####
8822
8823 my $XA_tag = "XA:Z:$number_of_mismatches_1";
8824 my $XB_tag = "XB:Z:$number_of_mismatches_2";
8825
8826
8827 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
8828 ### optionally print number of non-bisulfite mismatches
8829 if ($non_bs_mm){
8830 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n";
8831 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n";
8832 }
8833 else{ # default
8834 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
8835 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
8836 }
8837 }
8838
### Returns the reverse complement of a nucleotide sequence.
###   argument: the sequence to process; dies if it is missing or evaluates to false
###   returns:  the reversed sequence with A<->T and C<->G exchanged. Lower case a/c/t/g are
###             complemented to upper case, any other character (e.g. N) is left untouched
sub revcomp{
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  # reversing a copy first, then complementing that copy in place
  (my $reverse_complement = scalar reverse $sequence) =~ tr/ACTGactg/TGACTGAC/;

  return $reverse_complement;
}
8845
### Counts the positions at which the read differs from the reference sequence
### (the sub name is a historic misspelling of 'Hamming distance').
###   arguments: 1) actual (read) sequence, 2) reference sequence
###   returns:   number of read positions whose base is not identical to the reference base at
###              the same offset. Only the length of the read is considered: read positions
###              without a reference counterpart count as mismatches, surplus reference bases
###              are ignored
sub hemming_dist{
  my ($actual_seq,$ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  my $read_length = length($actual_seq);
  my $ref_length  = length($ref_seq);

  # only positions present in both sequences can be compared base by base; this avoids comparing
  # against undefined values (and the warnings that come with it) if the reference is shorter
  my $comparable = $ref_length < $read_length ? $ref_length : $read_length;

  # read positions beyond the end of the reference can never match
  my $mismatches = $read_length - $comparable;

  foreach my $pos (0..$comparable - 1){
    ++$mismatches if (substr($actual_seq,$pos,1) ne substr($ref_seq,$pos,1));
  }
  return $mismatches;
}
8856
8857
8858 ### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might
8859 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels
8860 ### 05 June 2014
8861
8862
### Builds the optional SAM tag MD:Z: for one alignment.
###
### Arguments:
###   $actual_seq   read sequence
###   $ref_seq      reference sequence which is compared to the read position by position. A
###                 reference base 'X' is treated as padding for an insertion in the read and is
###                 skipped (neither counted as match nor reported as mismatch)
###   $cigar        CIGAR string of the alignment; it is only parsed if it contains a 'D'
###   $md_sequence  sequence the deleted bases are extracted from (via substr), only used if the
###                 CIGAR string contains a deletion
###
### Returns the complete tag including the leading 'MD:Z:'.
###
### Dies (only while handling deletions) if the CIGAR string has a differing number of lengths and
### operations, contains operations other than M, I or D, if two non-digit MD elements follow each
### other, or if a deletion could not be placed within the MD string.
###
### The diagnostic messages are printed if $verbose is true; $verbose is not declared in this sub
### and is expected to be set elsewhere in the script.
sub make_mismatch_string{
  my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_;

  my $MD_tag = "MD:Z:";
  my $prev_matching = 0;   # number of matching bases since the last reported mismatch

  my $ref_base;
  my $actual_base;

  ### Step 1: position by position comparison, deletions are not taken into account yet
  foreach my $pos ( 0..length($actual_seq) - 1 ){

    $actual_base = substr($actual_seq,$pos,1);
    $ref_base    = substr($ref_seq,$pos,1);

    if ( $actual_base eq $ref_base ){
      ++$prev_matching;
    }
    elsif ($ref_base ne 'X'){
      # A real mismatch (a padded 'X' caused by an insertion in the read is simply ignored).
      # Writing out the number of bases which matched until now followed by the reference base.
      # This number is also written out if it is 0, i.e. for a mismatch at the very start or
      # right next to another mismatch
      $MD_tag .= $prev_matching;
      $MD_tag .= $ref_base;

      $prev_matching = 0; # resetting $prev_matching
    }
  }
  ### appending the number of matches one last time
  $MD_tag .= $prev_matching;


  ### Step 2: If the read contains deletion(s) we need to take care of these in the MD-tag as well
  if ($cigar =~ /D/){
    my $deletions_total = ($cigar =~ tr/D//);   # number of deletion operations in the CIGAR string
    if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";}

    if ($verbose){ warn "There was a deletion in the read!\n";}
    if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";}

    # parsing CIGAR string
    my @len = split (/\D+/,$cigar); # storing the length per operation
    my @ops = split (/\d+/,$cigar); # storing the operation
    shift @ops;                     # remove the empty first element
    die "CIGAR string contained a non-matching number of lengths and operations\n" unless (@len == @ops);

    my $MD_pos_so_far = 0;          # position reached while walking through the MD string
    my $deletions_processed = 0;
    my $del_pos = 0;                # position reached while walking through the CIGAR string
    my $deleted_bases = '';
    my ($new_MD) = $MD_tag =~ /MD:Z:(.*)/;   # the MD string of step 1 without the 'MD:Z:' prefix
    my $md_index_already_processed; # index of @md up to which elements were handled for an earlier deletion

    my @md = split //,$new_MD;      # the MD string is processed character by character

    if ($verbose){ warn "New MD-tag: $new_MD\n\n";}
    $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag
    $new_MD = '';      # using this to build up a new string that will replace the old \@md

    if ($verbose){ warn "CIGAR string; $cigar\n";}
    ### walking through the CIGAR operations; every deletion triggers a pass over the MD string
    foreach my $index(0..$#len){

      if ($ops[$index] eq 'M'){ # matching bases
        $del_pos += $len[$index];
        if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";}
      }
      elsif($ops[$index] eq 'I'){ # insertion
        $del_pos += $len[$index];
        ### need to add insertions in the read to MD pos so far!
        $MD_pos_so_far += $len[$index];
        if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";}
      }
      elsif($ops[$index] eq 'D'){ # deletion
        if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";}
        $deleted_bases = substr($md_sequence,$del_pos,$len[$index]);
        if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";}

        ### Now we need to process the MD-tag so far and write out everything up until this point, inlcuding the deletion
        if ($verbose){ warn "Now processing the MD-tag\n";}
        my $op;                      # MD operation collected so far: a (multi-digit) number or a mismatching base

        my $this_deletion_processed; # set once the current deletion has been written out
        my $md_processed_so_far;
        my $current_md_index;        # index of the current element within @md

        foreach my $el (@md){

          unless (defined $current_md_index){
            $current_md_index = 0; # first element = index 0
          }
          else{
            ++$current_md_index;
          }

          # elements which were already written out for a previous deletion are only copied
          if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){
            if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";}
            $new_MD .= $el;
            next;
          }

          if ($verbose){ warn "Current element: $el\n";}
          unless (defined $op){ # initialize
            $op = $el;
            if ($verbose){ warn "Initializing \$op as $op\n";}
            next;
          }

          if ($deletions_processed == $deletions_total){
            if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";}
            $MD_tag .= $el;
            $new_MD .= $el;
            next;
          }
          # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag
          if ($this_deletion_processed){
            $new_MD .= $el;
            next;
          }

          if ($op =~ /^\d+$/){
            if ($verbose){ warn "Operation so far was a digit: $op\n";}
            if ($el =~ /\d/){
              # still reading a multi-digit number of matching bases
              $op .= $el;
              if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";}
              next;
            }
            else{
              if ($verbose){ warn "current element is a word character: $el\n";}

              ### Need to determine if the matching operation length includes the deletion position
              if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";}
              $MD_pos_so_far += $op;
              if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";}
              if ($MD_pos_so_far < $del_pos){
                if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";}
                $MD_tag .= $op;
                $new_MD .= $op;
                if ($verbose){ warn "Setting new operation to: $el\n";}
                $op = $el; # setting new $op
              }
              else{
                if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
                ### splitting up the number of matching bases in number before and after the deletion

                my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
                my $pos_before_deletion = $op - $pos_after_deletion;
                if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}
                $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
                $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";
                if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";}

                #adjusting the MD_position by the number of bases after the deletion
                $MD_pos_so_far -= $pos_after_deletion;
                if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }
                ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD
                $new_MD .= $el;

                $deletions_processed += 1;
                $this_deletion_processed = 1;

                if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
                  if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
                  $MD_tag .= "${pos_after_deletion}";

                  ### also appending the current element because we are writing out the rest of the MD-string unchanged
                  if ($verbose){ warn "also appending the current element $el\n";}
                  $MD_tag .= $el;
                  ### Finally also adding the length of the deletion to $del_pos
                  $del_pos += $len[$index];
                  if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                }
                else{
                  if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
                  if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length($op),") to current_md_index\n";}


                  ### This migh need looking at!!

                  # the rewritten MD string differs in length from the old one, so the index is shifted accordingly
                  $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length($op);
                  if ($verbose){ warn "Current index = $current_md_index\n";}

                  if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
                  $md_index_already_processed = $current_md_index - 1;

                  if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

                  ### Finally also adding the length of the deletion to $del_pos
                  $del_pos += $len[$index];
                  $MD_pos_so_far += $len[$index];
                  if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                  if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                  #setting $op to en empty string so it is not being processed as the last element
                  $op = '';
                  # the loop is not left here: the remaining elements are copied to $new_MD via $this_deletion_processed
                }
              }
            }
            if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
          }
          else{
            if ($verbose){ warn "Operation so far was a word character: $op\n";}
            if ($el =~ /\d+/){
              # processing the previous mismatch position
              $MD_tag .= $op;
              $new_MD .= $op;
              $MD_pos_so_far += length($op);
              if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";}
            }
            else{
              # this should never occur since mismatches are followed by a 0 or another digit
              die "current element is a another word character: $el. This should never happen!\n";
            }
            if ($verbose){ warn "Setting new operation to: $el\n";}
            $op = $el; # setting new $op
            if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
          }
        }

        ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag
        if ($op =~ /\d+/ and $deletions_processed < $deletions_total){
          if ($verbose){ warn "\n\nlast operation was $op\n";}
          if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";}

          $MD_pos_so_far += $op;
          if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";}
          if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";}
          if ($MD_pos_so_far >= $del_pos){
            if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
            ### splitting up the number of matching bases in number before and after the deletion

            my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
            my $pos_before_deletion = $op - $pos_after_deletion;
            if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}

            $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
            $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";

            #adjusting the MD_position by the number of bases after the deletion
            $MD_pos_so_far -= $pos_after_deletion;
            if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }

            $deletions_processed += 1;
            $this_deletion_processed = 1;

            if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
              if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
              $MD_tag .= "${pos_after_deletion}";

            }
            else{
              if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
              if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length($op),") to current_md_index\n";}

              $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length($op);
              if ($verbose){ warn "Current index = $current_md_index\n";}

              if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
              # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (it hasn't been incremented in the first place...)
              $md_index_already_processed = $current_md_index;

              if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

              $MD_pos_so_far += $len[$index];
              if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
            }
            ### Finally also adding the length of the deletion to $del_pos
            $del_pos += $len[$index];
            if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
          }
          else{
            die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n";
          }
        }

        # forming a new @md from the MD string which now includes the current deletion
        @md = split //,$new_MD;
        $new_MD = '';
        if ($verbose){ warn "New \@md array: @md\n\n";}
        if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";}

      }
      else{
        # only M, I and D are handled above, everything else (including N) ends up here
        die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
      }
    }

  }
  if ($verbose){ warn "Returning MD-tag: $MD_tag\n";}
  return $MD_tag;

}
9176
9177 ### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might
9178 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels
9179 # sub make_mismatch_string{
9180 # my $actual_seq = shift or die "Missing actual sequence\n";
9181 # my $ref_seq = shift or die "Missing reference sequence\n";
9182 # my $XX_tag = "XX:Z:";
9183
9184 # my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison
9185
9186 # warn "'$tmp'\n"; sleep(1);
9187 # my $prev_mm_pos = 0;
9188
9189 # while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference
9190 # my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch
9191 # my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read
9192 # $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other
9193 # $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation)
9194 # $prev_mm_pos = pos($tmp); # Position of last mismatch
9195 # }
9196 # my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence
9197 # $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence
9198 # return $XX_tag;
9199 # }
9200
9201
9202
9203 sub print_helpfile{
9204 print << "HOW_TO";
9205
9206
9207 This program is free software: you can redistribute it and/or modify
9208 it under the terms of the GNU General Public License as published by
9209 the Free Software Foundation, either version 3 of the License, or
9210 (at your option) any later version.
9211
9212 This program is distributed in the hope that it will be useful,
9213 but WITHOUT ANY WARRANTY; without even the implied warranty of
9214 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
9215 GNU General Public License for more details.
9216 You should have received a copy of the GNU General Public License
9217 along with this program. If not, see <http://www.gnu.org/licenses/>.
9218
9219
9220
9221 DESCRIPTION
9222
9223
9224 The following is a brief description of command line options and arguments to control the Bismark
9225 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
9226 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
9227 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
9228 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
9229 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
9230 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
9231 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
9232 sequence from the genome and determine if there were any protected C's present or not.
9233
9234 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
9235 re-enabled by using --non_directional.
9236
9237 The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
9238 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
9239 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
9240
9241
9242 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
9243
9244
9245 ARGUMENTS:
9246
9247 <genome_folder> The path to the folder containing the unmodified reference genome
9248 as well as the subfolders created by the Bismark_Genome_Preparation
9249 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
9250 Bismark expects one or more fastA files in this folder (file extension: .fa
9251 or .fasta). The path can be relative or absolute.
9252
9253 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
9254 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
9255 correspond file-for-file and read-for-read with those specified in <mates2>.
9256 Reads may be a mix of different lengths. Bismark will produce one mapping result
9257 and one report file per paired-end input file pair.
9258
9259 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
9260 "_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
9261 correspond file-for-file and read-for-read with those specified in <mates1>.
9262 Reads may be a mix of different lengths.
9263
9264 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
9265 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
9266 produce one mapping result and one report file per input file.
9267
9268
9269 OPTIONS:
9270
9271
9272 Input:
9273
9274 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
9275 files (usually having extension .fq or .fastq). This is the default. See also
9276 --solexa-quals.
9277
9278 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
9279 files (usually having extension .fa, .mfa, .fna or similar). All quality values
9280 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
9281 the read name and the sequence on a single line (and not spread over several lines).
9282
9283 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
9284
9285 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
9286
9287 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
9288
9289 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
9290
9291 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
9292 (which can't). The formula for conversion is:
9293 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
9294 is usually the right option for use with (unconverted) reads emitted by the GA
9295 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
9296
9297 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
9298 reads emitted by GA Pipeline version 1.3 or later. Default: off.
9299
9300 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
9301 specified it is assumed that Bowtie (1 or 2) is in the PATH.
9302
9303
9304 Alignment:
9305
9306 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
9307 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
9308 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
9309
9310 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
9311 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
9312 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
9313
9314 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
9315 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
9316 quality values to the nearest 10 and saturates at 30. This value is not relevant for
9317 Bowtie 2.
9318
9319 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
9320 --best mode. Best-first search must keep track of many paths at once to ensure it is
9321 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
9322 memory impact of the descriptors, but they can still grow very large in some cases. If
9323 you receive an error message saying that chunk memory has been exhausted in --best mode,
9324 try adjusting this parameter up to dedicate more memory to the descriptors. This value
9325 is not relevant for Bowtie 2. Default: 512.
9326
9327 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
9328 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
9329 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
9330 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
9331
9332 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
9333 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
9334 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
9335 A 61-bp gap would not be valid in that case. Default: 500.
9336
9337 --multicore <int> Sets the number of parallel instances of Bismark to be run concurrently. This forks the
9338 Bismark alignment step very early on so that each individual Spawn of Bismark processes
9339 only every n-th sequence (n being set by --multicore). Once all processes have completed,
9340 the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged
9341 into single files in very much the same way as they would have been generated running Bismark
9342 conventionally with only a single instance.
9343
9344 If system resources are plentiful this is a viable option to speed up the alignment process
9345 (we observed a near linear speed increase for up to --multicore 8 tested). However, please note
9346 that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of
9347 Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner
9348 and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified
9349 will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for
9350 e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB of RAM, but at the same time
9351 reduce the alignment time to ~25-30%. You have been warned.
9352
9353
9354
9355 Bowtie 1 Reporting:
9356
9357 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
9358 will be used by default.
9359
9360 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
9361 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
9362 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
9363 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
9364 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
9365 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
9366 the best alignment). --best mode also removes all strand bias. Note that --best does not
9367 affect which alignments are considered "valid" by Bowtie, only which valid alignments
9368 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
9369 Default: on.
9370
9371 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
9372 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
9373
9374
9375 Output:
9376
9377 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
9378 bisulfite strands will be reported. Default: OFF.
9379
9380 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
9381 to the original strands are merely theoretical and should not exist in reality. Specifying directional
9382 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
9383 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
9384 for strand-specific libraries).
9385
9386 --pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
9387 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
9388 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
9389 and OB ones. Use this option only if you are certain that your libraries were constructed following
9390 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
9391 --pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed
9392 temporary files only).
9393
9394 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
9395 split up into several smaller files to run concurrently and the output files are to be merged.
9396
9397 --quiet Print nothing besides alignments.
9398
9399 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
9400 of SAM format output.
9401
9402 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
9403 appear as they did in the input, without any translation of quality values that may have
9404 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
9405 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
9406 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
9407 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
9408
9409 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
9410 mismatches or other reads that fail to align uniquely to a file in the output directory.
9411 Written reads will appear as they did in the input, without any of the translation of quality
9412 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
9413 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
9414 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
9415
9416 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
9417 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
9418 to create it first. The path to the output folder can be either relative or absolute.
9419
9420 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
9421 the specified folder does not exist, Bismark will attempt to create it first. The path to the
9422 temporary folder can be either relative or absolute.
9423
9424 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read had during the
9425 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
9426 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
9427 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
9428 and 'XB:Z:number of mismatches' for read 2 of paired-end reads.
9429
9430 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
9431 space. This option is available for most alignment modes but is not available for paired-end FastA
9432 files. This option might be somewhat slower than writing out uncompressed files, but this awaits
9433 further testing.
9434
9435 --sam The output will be written out in SAM format instead of the default BAM format. Bismark will
9436 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
9437 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
9438 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).
9439
9440 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
9441 explicitly if Samtools is in the PATH already.
9442
9443 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
9444 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.
9445
9446 -B/--basename <basename> Write all output to files starting with this base file name. For example, '--basename foo'
9447 would result in the files 'foo.sam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes
9448 precedence over --prefix.
9449
9450 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
9451 this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
9452 both the appended read IDs and custom FLAG values may cause problems with some downstream tools
9453 such as Picard, new defaults were implemented as of version 0.8.3.
9454
9455
9456 default old_flag
9457 =================== ===================
9458 Read 1 Read 2 Read 1 Read 2
9459
9460 OT: 99 147 67 131
9461
9462 OB: 83 163 115 179
9463
9464 CTOT: 99 147 67 131
9465
9466 CTOB: 83 163 115 179
9467
9468
9469 Other:
9470
9471 -h/--help Displays this help file.
9472
9473 -v/--version Displays version information.
9474
9475
9476 BOWTIE 2 SPECIFIC OPTIONS
9477
9478 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
9479 alignments, i.e. searches for alignments involving all read characters (also called
9480 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
9481 and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2
9482 indexes are supported. Default: off.
9483
9484 Bowtie 2 alignment options:
9485
9486 -N <int> Sets the number of mismatches to be allowed in a seed alignment during multiseed alignment.
9487 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
9488 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
9489 Bowtie 1 see -n).
9490
9491 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
9492 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
9493 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
9494 Bowtie 1 see -l).
9495
9496 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
9497 position to be the highest possible, regardless of the actual value. I.e. input is treated
9498 as though all quality values are high. This is also the default behavior when the input
9499 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
9500
9501
9502 Bowtie 2 paired-end options:
9503
9504 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
9505 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
9506 on by default.
9507
9508 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
9509 A discordant alignment is an alignment where both mates align uniquely, but that does not
9510 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
9511 and it is on by default.
9512
9513
9514 Bowtie 2 effort options:
9515
9516 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
9517 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
9518 new second-best alignment. Default: 15.
9519
9520 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
9521 When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
9522 mismatches allowed) at different offsets and searches for more alignments. A read is considered
9523 to have repetitive seeds if the total number of seed hits divided by the number of seeds
9524 that aligned at least once is greater than 300. Default: 2.
9525
9526 Bowtie 2 parallelization options:
9527
9528
9529 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
9530 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
9531 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
9532 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
9533 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
9534 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
9535 automatically use the option '--reorder', which guarantees that output SAM records are printed in
9536 an order corresponding to the order of the reads in the original input file, even when -p is set
9537 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
9538 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
9539 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
9540 correspond to input order in that case.
9541
9542 Bowtie 2 Scoring options:
9543
9544 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
9545 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
9546 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
9547 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
9548 L,0,-0.2.
9549
9550 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
9551 of <int1> + N * <int2>. Default: 5, 3.
9552
9553 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
9554 a penalty of <int1> + N * <int2>. Default: 5, 3.
9555
9556
9557 Bowtie 2 Reporting options:
9558
9559 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
9560 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
9561 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
9562 effort expended to find valid alignments.
9563
9564 For reference, this used to be the old (now deprecated) description of -M:
9565 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
9566 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
9567 happens first. Only the best alignment is reported. Information from the other alignments is used to
9568 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
9569 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
9570 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
9571 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
9572 always used and its default value is set to 10.
9573
9574
9575 'VANILLA' Bismark OUTPUT:
9576
9577 Single-end output format (tab-separated):
9578
9579 (1) <seq-ID>
9580 (2) <read alignment strand>
9581 (3) <chromosome>
9582 (4) <start position>
9583 (5) <end position>
9584 (6) <observed bisulfite sequence>
9585 (7) <equivalent genomic sequence>
9586 (8) <methylation call>
9587 (9) <read conversion>
9588 (10) <genome conversion>
9589 (11) <read quality score (Phred33)>
9590
9591
9592 Paired-end output format (tab-separated):
9593 (1) <seq-ID>
9594 (2) <read 1 alignment strand>
9595 (3) <chromosome>
9596 (4) <start position>
9597 (5) <end position>
9598 (6) <observed bisulfite sequence 1>
9599 (7) <equivalent genomic sequence 1>
9600 (8) <methylation call 1>
9601 (9) <observed bisulfite sequence 2>
9602 (10) <equivalent genomic sequence 2>
9603 (11) <methylation call 2>
9604 (12) <read 1 conversion>
9605 (13) <genome conversion>
9606 (14) <read 1 quality score (Phred33)>
9607 (15) <read 2 quality score (Phred33)>
9608
9609
9610 Bismark SAM OUTPUT (default):
9611
9612 (1) QNAME (seq-ID)
9613 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
9614 (3) RNAME (chromosome)
9615 (4) POS (start position)
9616 (5) MAPQ (always 255 for use with Bowtie)
9617 (6) CIGAR
9618 (7) RNEXT
9619 (8) PNEXT
9620 (9) TLEN
9621 (10) SEQ
9622 (11) QUAL (Phred33 scale)
9623 (12) NM-tag (edit distance to the reference)
9624 (13) MD-tag (base-by-base mismatches to the reference (handles indels))
9625 (14) XM-tag (methylation call string)
9626 (15) XR-tag (read conversion state for the alignment)
9627 (16) XG-tag (genome conversion state for the alignment)
9628 (17) XA/XB-tag (non-bisulfite mismatches) (optional!)
9629
9630 Each read of paired-end alignments is written out in a separate line in the above format.
9631
9632
9633 Last edited on 06 May 2015.
9634
9635 HOW_TO
9636 }