comparison bismark_wrapper/bismark @ 1:183de9d00131 draft

add indices.loc files
author bjoern-gruening
date Tue, 25 Dec 2012 05:52:28 -0500
parents
children
comparison
equal deleted inserted replaced
0:36d124f44c0a 1:183de9d00131
1 #!/usr/bin/perl --
2 use strict;
3 use warnings;
4 use IO::Handle;
5 use Cwd;
6 $|++;
7 use Getopt::Long;
8
9
10 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@babraham.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
### Directory Bismark was launched from; the main loop changes back to it before each input file
my $parent_dir      = getcwd;
my $bismark_version = 'v0.7.7';

### The command line as supplied by the user (captured before any option is rewritten below)
my $command_line = join (" ",@ARGV);

### The '.' in the option name --solexa1.3-quals causes Getopt::Long to fail, so the option is
### replaced with --phred64-quals before the command line gets processed.
### The loop variable aliases the elements of @ARGV, i.e. the replacement happens in place.
for my $option (@ARGV){
  $option = '--phred64-quals' if ($option eq '--solexa1.3-quals');
}

my @filenames; # will be populated by processing the command line

my (
  $genome_folder,
  $CT_index_basename,
  $GA_index_basename,
  $path_to_bowtie,
  $sequence_file_format,
  $bowtie_options,
  $directional,
  $unmapped,
  $ambiguous,
  $phred64,
  $solexa,
  $output_dir,
  $bowtie2,
  $vanilla,
  $sam_no_hd,
  $skip,
  $upto,
  $temp_dir,
) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome (kept in memory and reused when several input files are processed)
my %counting;    # counting various events

my $seqID_contains_tabs; # reset to 0 for every input file by the main loop
45
### Main loop: every entry of @filenames is processed in turn. An entry containing a comma is
### treated as a pair of mate files ('file_1,file_2') and aligned in paired-end mode, anything
### else is aligned in single-end mode. For each entry the reads are bisulfite-converted,
### aligned (Bowtie 1 or Bowtie 2, depending on $bowtie2) and then passed on to the
### methylation call procedure.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";

  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### every format other than 'FASTA' is handled as FastQ
  my $is_fasta     = ($sequence_file_format eq 'FASTA');
  my $format_label = $is_fasta ? 'FastA' : 'FastQ';

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){

    my @instance_names = qw(CTread1GAread2CTgenome GAread1CTread2GAgenome GAread1CTread2CTgenome CTread1GAread2GAgenome);
    foreach my $index (0..$#instance_names){
      $fhs[$index]->{name} = $instance_names[$index];
    }

    print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    print "Input files are in $format_label format\n";

    ### the FastA and FastQ code paths only differ in the subroutines they call
    my $transform = $is_fasta ? \&biTransformFastAFiles_paired_end
                              : \&biTransformFastQFiles_paired_end;

    my $align;
    if ($is_fasta){
      $align = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                        : \&paired_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      $align = $bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                        : \&paired_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    if ($directional){
      ### only the first value returned for each mate is used in directional mode
      ($C_to_T_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($G_to_A_infile_2) = $transform->($filename_2,2);
    }
    else{
      ($C_to_T_infile_1,$G_to_A_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($C_to_T_infile_2,$G_to_A_infile_2) = $transform->($filename_2,2);
    }

    ### Instances 0 and 3 align CT-converted read 1 together with GA-converted read 2.
    foreach my $index (0,3){
      $fhs[$index]->{inputfile_1} = $C_to_T_infile_1;
      $fhs[$index]->{inputfile_2} = $G_to_A_infile_2;
    }
    ### Instances 1 and 2 align GA-converted read 1 together with CT-converted read 2. In
    ### directional mode these two files were never assigned above and are therefore undef,
    ### which leaves instances 1 and 2 without any input files.
    foreach my $index (1,2){
      $fhs[$index]->{inputfile_1} = $G_to_A_infile_1;
      $fhs[$index]->{inputfile_2} = $C_to_T_infile_2;
    }

    $align->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

    print "Input file is in $format_label format\n";

    my $transform = $is_fasta ? \&biTransformFastAFiles
                              : \&biTransformFastQFiles;

    my $align;
    if ($is_fasta){
      $align = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                        : \&single_end_align_fragments_to_bisulfite_genome_fastA;
    }
    else{
      $align = $bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                        : \&single_end_align_fragments_to_bisulfite_genome_fastQ;
    }

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    if ($directional){
      ($C_to_T_infile) = $transform->($filename);
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = $transform->($filename);
    }

    ### instances 0 and 1 always read the C->T converted file; instances 2 and 3 only get
    ### an input file (the G->A converted one) for non-directional libraries
    $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
    unless ($directional){
      $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
    }

    ### Creating 4 different bowtie filehandles and storing the first entry
    $align->($C_to_T_infile,$G_to_A_infile);

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
204
### Sets up a single-end methylation call run for one input file and then starts it:
###   - derives all output file names from the name of the input file (without its path)
###   - opens the alignment output (OUT), the mapping report (REPORT) and, if requested via
###     $unmapped / $ambiguous, the UNMAPPED / AMBIG files inside $output_dir
###   - reads the genome into memory unless %chromosomes already holds it
###   - writes the SAM header unless $vanilla or $sam_no_hd is set
###   - hands over to the FastA or FastQ methylation caller
###
### Arguments: the original sequence file, followed by the C->T and the G->A converted versions
### of that file (both are passed on unchanged to the methylation caller).
### Dies if one of the output files cannot be opened for writing.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### output files are named after the input file, any leading path information is dropped
  my $filename = $sequence_file;
  if ($sequence_file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### printing all alignments to a results file
  my $outfile = $filename . ( $bowtie2 ? '_bt2_bismark.sam' # SAM format is the default for Bowtie 2
                            : $vanilla ? '_bismark.txt'     # vanilla custom Bismark single-end output (like Bismark versions 0.5.X)
                            :            '_bismark.sam' );  # SAM is the default output

  ### The filehandles stay barewords (package globals) on purpose: REPORT is also printed to by
  ### print_final_analysis_report_single_end, and the other handles are presumably used by the
  ### methylation call routines further down in this file.
  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename . ( $bowtie2 ? '_bt2_Bismark_mapping_report.txt' : '_Bismark_mapping_report.txt' );

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = $filename . '_unmapped_reads.txt';
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $output_dir$unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = $filename . '_ambiguous_reads.txt';
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $output_dir$ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
285
### Sets up a paired-end methylation call run for one pair of input files and then starts it:
###   - derives all output file names from the names of the input files (without their paths);
###     the alignment output and the report are named after the first file of the pair
###   - opens the alignment output (OUT), the mapping report (REPORT) and, if requested via
###     $unmapped / $ambiguous, the UNMAPPED_1/2 and AMBIG_1/2 files inside $output_dir
###   - reads the genome into memory unless %chromosomes already holds it
###   - writes the SAM header unless $vanilla or $sam_no_hd is set
###   - hands over to the FastA or FastQ paired-end methylation caller
###
### Arguments: the two original sequence files, followed by the C->T and G->A converted versions
### of file 1 and of file 2 (all four are passed on unchanged to the methylation caller).
### Dies if one of the output files cannot be opened for writing.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### output files are named after the input files, any leading path information is dropped
  my $strip_path = sub {
    my ($path) = @_;
    return $path =~ m/.*\/(.*)$/ ? $1 : $path;
  };
  my $filename_1 = $strip_path->($sequence_file_1);
  my $filename_2 = $strip_path->($sequence_file_2);

  ### printing all alignments to a results file
  my $outfile = $filename_1 . ( $bowtie2 ? '_bismark_bt2_pe.sam' # SAM format is the default Bowtie 2 output
                              : $vanilla ? '_bismark_pe.txt'     # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
                              :            '_bismark_pe.sam' );  # SAM format is the default Bowtie 1 output

  ### The filehandles stay barewords (package globals) on purpose: REPORT is also printed to by
  ### print_final_analysis_report_paired_ends, and the other handles are presumably used by the
  ### methylation call routines further down in this file.
  print "Writing bisulfite mapping results to $output_dir$outfile\n\n";
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $output_dir$outfile: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $filename_1 . ( $bowtie2 ? '_Bismark_bt2_paired-end_mapping_report.txt' : '_Bismark_paired-end_mapping_report.txt' );

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $output_dir$reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = $filename_1 . '_unmapped_reads_1.txt';
    my $unmapped_2 = $filename_2 . '_unmapped_reads_2.txt';
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $output_dir$unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $output_dir$unmapped_2: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_1 and $output_dir$unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = $filename_1 . '_ambiguous_reads_1.txt';
    my $amb_2 = $filename_2 . '_ambiguous_reads_2.txt';
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $output_dir$amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $output_dir$amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$amb_1 and $output_dir$amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
384
### Finishes a single-end run: deletes the temporary bisulfite-converted input file(s) from
### $temp_dir and prints the final alignment and cytosine methylation report. All figures are
### taken from the %counting hash. The report goes to the REPORT filehandle (which has to be
### open already) and, depending on the section, to STDOUT or STDERR as well.
###
### Arguments: names of the C->T and G->A converted temporary files (relative to $temp_dir).
### In directional mode only the C->T file is deleted.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### most lines are sent to two places, these helpers ensure both copies are identical
  my $to_stderr_and_report = sub { warn @_;  print REPORT @_; };
  my $to_stdout_and_report = sub { print @_; print REPORT @_; };

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles (unlink returns the number of files it deleted)
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $temp_dir$C_to_T_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  $to_stdout_and_report->("Final Alignment report\n",'='x22,"\n");

  ### printing a final report for the methylation call procedure
  $to_stderr_and_report->("Sequences analysed in total:\t$counting{sequences_count}\n");

  ### guarding against a division by zero if not a single sequence was analysed
  my $percent_alignable_sequences = 0;
  unless ($counting{sequences_count} == 0){
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  ### the copy on STDERR is followed by an additional blank line
  my $mapping_efficiency = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn $mapping_efficiency,"\n";
  print REPORT $mapping_efficiency;

  ### (the number of overruled low complexity alignments is currently not reported)

  $to_stdout_and_report->(
    "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
    join ("\n",
          "CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)",
          "CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)",
          "GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)",
          "GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
    "\n\n",
  );

  if ($directional){
    $to_stdout_and_report->("Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n");
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  $to_stderr_and_report->(
    "Final Cytosine Methylation Report\n",'='x33,"\n",
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n",
    "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n",
  );

  ### Percentage of methylated Cs per context. A percentage can only be given if at least one
  ### C was seen in that context. Testing with 'defined' makes it explicit that a legitimate
  ### result of '0.0' is reported as a percentage as well.
  foreach my $context (qw(CpG CHG CHH)){
    my $methylated   = $counting{"total_me${context}_count"};
    my $unmethylated = $counting{"total_unmethylated_${context}_count"};

    my $percent_methylated;
    if (($methylated+$unmethylated) > 0){
      $percent_methylated = sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
    }

    ### the last context (CHH) closes the report with two blank lines
    my $line_end = ($context eq 'CHH') ? "\n\n\n" : "\n";

    if (defined $percent_methylated){
      $to_stderr_and_report->("C methylated in $context context:\t${percent_methylated}%$line_end");
    }
    else{
      $to_stderr_and_report->("Can't determine percentage of methylated Cs in $context context if value was 0$line_end");
    }
  }

  if ($seqID_contains_tabs){
    $to_stderr_and_report->("The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n");
  }
}
528
### Finishes a paired-end run: deletes the temporary bisulfite-converted input files from
### $temp_dir and prints the final alignment and cytosine methylation report. All figures are
### taken from the %counting hash. The report goes to the REPORT filehandle (which has to be
### open already) and, depending on the section, to STDOUT or STDERR as well.
###
### Arguments: names of the C->T and G->A converted temporary files of read 1, followed by
### those of read 2 (relative to $temp_dir). In directional mode only the C->T file of read 1
### and the G->A file of read 2 are deleted.
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### helpers for text which is sent to two destinations at once
  my $to_stderr_and_report = sub { warn @_;  print REPORT @_; };
  my $to_stdout_and_report = sub { print @_; print REPORT @_; };

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  if ($directional){
    my $files_removed = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
    if ($files_removed == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
    }
    else{
      warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
    }
  }
  else{
    my $files_removed = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
    if ($files_removed == 4){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
    }
    else{
      warn "Could not delete temporary files properly: $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  $to_stderr_and_report->("Final Alignment report\n",'='x22,"\n");

  ### printing a final report for the methylation call procedure
  $to_stderr_and_report->("Sequence pairs analysed in total:\t$counting{sequences_count}\n");

  ### mapping efficiency stays at 0 if no sequence pair was analysed at all
  my $percent_alignable_sequence_pairs = 0;
  unless ($counting{sequences_count} == 0){
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  ### the two copies of this line end differently: STDOUT gets an extra blank line, the report a trailing space
  my $mapping_efficiency = "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%";
  print $mapping_efficiency,"\n\n";
  print REPORT $mapping_efficiency," \n";

  $to_stdout_and_report->(
    "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
    join ("\n",
          "CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)",
          "GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)",
          "GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)",
          "CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),
    "\n\n",
  );

  if ($directional){
    $to_stdout_and_report->("Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n");
  }

  ### detailed information about Cs analysed
  my @contexts = qw(CpG CHG CHH);

  my $total_number_of_C = 0;
  foreach my $context (@contexts){
    $total_number_of_C += $counting{"total_me${context}_count"} + $counting{"total_unmethylated_${context}_count"};
  }

  ### one line per context, each group of three is followed by a blank line
  my @methylated_lines = map { "Total methylated C's in $_ context:\t".$counting{"total_me${_}_count"}."\n" } @contexts;
  my @conversion_lines = map { "Total C to T conversions in $_ context:\t".$counting{"total_unmethylated_${_}_count"}."\n" } @contexts;

  $to_stderr_and_report->(
    "Final Cytosine Methylation Report\n",'='x33,"\n",
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    @methylated_lines,"\n",
    @conversion_lines,"\n",
  );

  ### percentage of methylated Cs per context; stays undefined if no C was seen in that context
  my $percent_methylated = sub {
    my ($context) = @_;
    my $methylated = $counting{"total_me${context}_count"};
    my $all_calls  = $methylated + $counting{"total_unmethylated_${context}_count"};
    return unless ($all_calls > 0);
    return sprintf("%.1f",100*$methylated/$all_calls);
  };

  ### printing the methylated C percentage for each context if applicable
  foreach my $context (@contexts){
    my $percentage = $percent_methylated->($context);

    ### the last context (CHH) closes the report with two blank lines
    my $line_end = ($context eq 'CHH') ? "\n\n\n" : "\n";

    my $line = $percentage
      ? "C methylated in $context context:\t${percentage}%$line_end"
      : "Can't determine percentage of methylated Cs in $context context if value was 0$line_end";

    $to_stderr_and_report->($line);
  }

}
659
### Reads a single-end FastA file record by record (header line + sequence line) and hands every
### read to the Bowtie result checker, which makes the methylation call for reads that aligned.
###
### Arguments:
###   $sequence_file - FastA file holding the original reads; a name ending in .gz is read through zcat
###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
###
### Uses the file-wide settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} and writes to the AMBIG / UNMAPPED filehandles when requested.
### Dies if the sequence file cannot be opened.
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### The file name is always passed as a separate argument (3-argument open, list form for the
  ### zcat pipe) so that names containing spaces or shell metacharacters are read as they are
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    ### readline only returns undef at the end of the input, so definedness is what needs testing
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: the first $skip records are read but not processed
    if ($skip and $count <= $skip){
      next;
    }
    ### --upto: stop once more than $upto records have been read
    if ($upto and $count > $upto){
      last;
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
    }
    $return = 0 unless ($return); # the checker may return nothing at all

    if ($ambiguous and $return == 2){
      ### print the sequence to ambiguous.out if --ambiguous was specified
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }
    elsif ($unmapped and $return == 1){
      ### print the sequence to <unmapped.out> file if --un was specified
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### The close status is deliberately not checked: if --upto stopped the reading early, a zcat
  ### child is expected to end with a broken pipe, which is not an error here
  close (IN);

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
732
### Reads a single-end FastQ file record by record (4 lines per read) and hands every read to the
### Bowtie result checker, which makes the methylation call for reads that aligned.
###
### Arguments:
###   $sequence_file - FastQ file holding the original reads; a name ending in .gz is read through zcat
###   $C_to_T_infile - handed on unchanged to print_final_analysis_report_single_end
###   $G_to_A_infile - handed on unchanged to print_final_analysis_report_single_end
###
### Uses the file-wide settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} and writes to the AMBIG / UNMAPPED filehandles when requested.
### Dies if the sequence file cannot be opened.
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### The file name is always passed as a separate argument (3-argument open, list form for the
  ### zcat pipe) so that names containing spaces or shell metacharacters are read as they are
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;   # the '+' line, only needed when the read is written out again
    my $quality_value = <IN>;
    ### readline only returns undef at the end of the input; testing for definedness keeps a final
    ### quality line consisting of a single '0' (without newline) from ending the loop too early
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip: the first $skip records are read but not processed
    if ($skip and $count <= $skip){
      next;
    }
    ### --upto: stop once more than $upto records have been read
    if ($upto and $count > $upto){
      last;
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
    }
    $return = 0 unless ($return); # the checker may return nothing at all

    if ($ambiguous and $return == 2){
      ### print the sequence to ambiguous.out if --ambiguous was specified
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2; # was never chomped, still carries its line ending
      print AMBIG "$quality_value\n";
    }
    elsif ($unmapped and $return == 1){
      ### print the sequence to <unmapped.out> file if --un was specified
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2; # was never chomped, still carries its line ending
      print UNMAPPED "$quality_value\n";
    }
  }

  ### The close status is deliberately not checked: if --upto stopped the reading early, a zcat
  ### child is expected to end with a broken pipe, which is not an error here
  close (IN);

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
812
### Reads two FastA files in lockstep (read 1 and read 2 of each pair) and hands every pair to the
### paired-end Bowtie result checker, which makes the methylation call for pairs that aligned.
### Both files are required to have the exact same number of sequences, therefore the sequences
### can be processed jointly one by one; reading stops as soon as either file runs out.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastA files of read 1 and read 2; a name ending in .gz
###                                        is read through zcat
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2 - handed on unchanged to print_final_analysis_report_paired_ends
###
### Uses the file-wide settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} and writes to the AMBIG_1/2 and UNMAPPED_1/2 filehandles when
### requested. Dies if one of the sequence files cannot be opened.
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### Each file is tested for the .gz extension on its own, so that a pair made up of one gzipped
  ### and one plain file is read correctly as well. The file name is always passed as a separate
  ### argument (3-argument open, list form for the zcat pipe) so that names containing spaces or
  ### shell metacharacters are read as they are
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($handle,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($handle,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      open ($handle,'<',$file) or die "Failed to read from $file: $!\n";
    }
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    ### readline only returns undef at the end of the input, so definedness is what needs testing
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### --skip: the first $skip pairs are read but not processed
    if ($skip and $count <= $skip){
      next;
    }
    ### --upto: stop once more than $upto pairs have been read
    if ($upto and $count > $upto){
      last;
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### copies taken before chomping and before the leading > is removed; these are what gets
    ### written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### only the ID of read 1 is handed to the checker
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    $return = 0 unless ($return); # the checker may return nothing at all

    if ($ambiguous and $return == 2){
      ### print the sequences to ambiguous_1 and _2 if --ambiguous was specified
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }
    elsif ($unmapped and $return == 1){
      ### print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  ### The close status is deliberately not checked: if --upto stopped the reading early, a zcat
  ### child is expected to end with a broken pipe, which is not an error here
  close (IN1);
  close (IN2);

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
904
### Reads two FastQ files in lockstep (read 1 and read 2 of each pair, 4 lines per read) and hands
### every pair to the paired-end Bowtie result checker, which makes the methylation call for pairs
### that aligned. Both files are required to have the exact same number of sequences, therefore
### the sequences can be processed jointly one by one; reading stops as soon as either file runs out.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastQ files of read 1 and read 2; a name ending in .gz
###                                        is read through zcat
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2 - handed on unchanged to print_final_analysis_report_paired_ends
###
### Uses the file-wide settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
### $counting{sequences_count} and writes to the AMBIG_1/2 and UNMAPPED_1/2 filehandles when
### requested. Dies if one of the sequence files cannot be opened.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### Each file is tested for the .gz extension on its own, so that a pair made up of one gzipped
  ### and one plain file is read correctly as well. The file name is always passed as a separate
  ### argument (3-argument open, list form for the zcat pipe) so that names containing spaces or
  ### shell metacharacters are read as they are
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($handle,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($handle,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      open ($handle,'<',$file) or die "Failed to read from $file: $!\n";
    }
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # the '+' line, only needed when the read is written out again
    my $quality_value_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # the '+' line, only needed when the read is written out again
    my $quality_value_2 = <IN2>;
    ### readline only returns undef at the end of the input; testing for definedness keeps a final
    ### quality line consisting of a single '0' (without newline) from ending the loop too early.
    ### A defined quality line implies that the '+' line before it was read as well
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### --skip: the first $skip pairs are read but not processed
    if ($skip and $count <= $skip){
      next;
    }
    ### --upto: stop once more than $upto pairs have been read
    if ($upto and $count > $upto){
      last;
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### copies taken before chomping and before the leading @ is removed; these are what gets
    ### written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    ### only the ID of read 1 is handed to the checker
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    $return = 0 unless ($return); # the checker may return nothing at all

    if ($ambiguous and $return == 2){
      ### print the sequences to ambiguous_1 and _2 if --ambiguous was specified
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }
    elsif ($unmapped and $return == 1){
      ### print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  ### The close status is deliberately not checked: if --upto stopped the reading early, a zcat
  ### child is expected to end with a broken pipe, which is not an error here
  close (IN1);
  close (IN2);

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1014
### Checks whether the read $identifier produced Bowtie 1 alignments in any of the alignment
### processes held in @fhs, selects the unique best alignment (if there is one) and performs and
### prints the methylation call for it.
###
### Arguments:
###   $sequence      - sequence of the read
###   $identifier    - read ID; compared against the first column of the Bowtie output lines
###   $quality_value - quality string of the read (optional); reads without one, i.e. FastA
###                    sequences, get assigned a quality value of Phred 40 ('I') throughout
###
### Return value:
###   2     - there was no unique best alignment and --ambiguous was specified
###   1     - --un was specified and there was either no alignment at all, or no unique best
###           alignment (the latter only if --ambiguous was not specified)
###   0     - in all other cases in which at least one alignment was found (methylation call made,
###           alignment rejected because of --directional, genomic sequence not extractable, ...)
###   undef - there was no alignment at all and --un was not specified
###
### Advances the Bowtie filehandles in @fhs and increments several %counting entries. Dies if a
### Bowtie output line cannot be interpreted or if a read ID shows up more than twice in a row.
sub check_bowtie_results_single_end{
  my ($sequence,$identifier,$quality_value) = @_;

  ### testing for definedness/length rather than truth, so that a quality string consisting of a
  ### single '0' is not mistaken for a missing one
  unless (defined $quality_value and length $quality_value){
    $quality_value = 'I'x(length$sequence);
  }

  ### alignments found for this read: number of mismatches -> "chromosome:position" -> alignment details
  my %mismatches = ();

  ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
    ### nothing to do for this index if the alignment waiting in last_line belongs to another read
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ### STEP I: processing the alignment stored in last_line.
    ### Sequences can fail at this point if there was only 1 seq in the wrong orientation, or if
    ### there were 2 seqs, both in the wrong orientation; we only continue if 1 was returned
    next unless (decide_whether_single_end_alignment_is_valid($index,$identifier) == 1);
    _store_bowtie1_single_end_alignment($index,\%mismatches);

    ### STEP II: reading in the next line from the bowtie filehandle. It can either be a second
    ### alignment of the same sequence or belong to a new sequence; in either case it is stored in
    ### @fhs. If it already belongs to the next sequence the validity check does not return 1 and
    ### the line will be processed in the next round only
    next unless (_read_next_bowtie1_single_end_alignment($index));
    next unless (decide_whether_single_end_alignment_is_valid($index,$identifier) == 1);
    _store_bowtie1_single_end_alignment($index,\%mismatches);

    ### STEP III: reading in one more line, which has to be the next alignment to be analysed
    if (_read_next_bowtie1_single_end_alignment($index) and $fhs[$index]->{last_seq_id} eq $identifier){
      die "The same seq ID occurred more than twice in a row\n";
    }
  }

  ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless (%mismatches){
    $counting{no_single_alignment_found}++;
    return 1 if ($unmapped); ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
    return;
  }

  ### We are now looking if there is a unique best alignment for this sequence. Only the alignments
  ### with the lowest number of mismatches are considered. If there is a single best position its
  ### details are stored for the methylation call; if there are several hits with the same (lowest)
  ### number of mismatches the sequence is discarded (see the special case of 3 hits below)
  my ($lowest_mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my $best_alignments = $mismatches{$lowest_mismatch_number};
  my $number_of_best_alignments = scalar keys %{$best_alignments};

  ### $sequence_fails serves as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
  my $sequence_fails = 0;
  ### hash reference which will store all information we need for the methylation call
  my $methylation_call_params;

  if ($number_of_best_alignments == 1){
    ### a single entry with the lowest number of mismatches is accepted as the best alignment
    my ($unique_best_alignment) = values %{$best_alignments};
    $methylation_call_params->{$identifier} = {
      bowtie_sequence      => $unique_best_alignment->{bowtie_sequence},
      chromosome           => $unique_best_alignment->{chromosome},
      position             => $unique_best_alignment->{position},
      index                => $unique_best_alignment->{index},
      number_of_mismatches => $lowest_mismatch_number,
    };
  }
  elsif ($number_of_best_alignments == 3){
    ### If there are 3 alignments with the same number of lowest mismatches we can discriminate 2 cases:
    ### (i) all 3 alignments are unique best hits and come from different alignment processes (== indices) or
    ### (ii) one alignment process gives a unique best alignment, whereas a second one produces 2 (or
    ### potentially many) alignments for the same sequence but in a different conversion state or against a
    ### different genome version (or both). This becomes especially relevant for highly converted sequences
    ### in which all Cs have been converted to Ts in the bisulfite reaction. E.g.
    ###   CAGTCACGCGCGCGCG will become
    ###   TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct
    ### alignment in the CT->CT alignment condition. If the same read is G->A transformed as well, the
    ### result depends on the methylation state of the original sequence:
    ###   highly methylated: CAATCACACACACACA
    ###   highly converted : TAATTATATATATATA <== reduced complexity (only 2 bases left and not 3), more
    ### likely to align to a low complexity genomic region. This would normally lead to the entire sequence
    ### being kicked out, which means that highly methylated (not converted) sequences are more likely to
    ### pass the alignment step, creating a bias for methylated reads. We do not want any bias whatsoever.
    ### Therefore we look at the number of transliterations needed to get from the actual sequence to the
    ### version that produced each alignment:
    ###   C->T: TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0 (assuming this gives the correct alignment)
    ###   G->A: TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6 (assuming this gives multiple wrong alignments)
    ### The alignment requiring the lowest number of transliterations is accepted if twice that number is
    ### still smaller than the number of transliterations of the second best candidate. Everything else is
    ### flagged with $sequence_fails = 1 and discarded.
    my @three_candidate_seqs;
    foreach my $candidate (values %{$best_alignments}){
      ### alignment processes 0 and 1 work on the C->T converted read, 2 and 3 on the G->A converted read
      my $transliterations_performed;
      if ($candidate->{index} == 0 or $candidate->{index} == 1){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
      }
      elsif ($candidate->{index} == 2 or $candidate->{index} == 3){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
      }
      else{
        die "Unexpected index number $candidate->{index} (expected a value between 0 and 3)\n";
      }
      push @three_candidate_seqs,{
        index                      => $candidate->{index},
        bowtie_sequence            => $candidate->{bowtie_sequence},
        mismatch_number            => $lowest_mismatch_number,
        chromosome                 => $candidate->{chromosome},
        position                   => $candidate->{position},
        seq_id                     => $candidate->{seq_id},
        transliterations_performed => $transliterations_performed,
      };
    }
    ### sorting in ascending order for the lowest number of transliterations performed
    @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
    my $fewest_transliterations = $three_candidate_seqs[0]->{transliterations_performed};
    my $second_fewest_transliterations = $three_candidate_seqs[1]->{transliterations_performed};

    if (($fewest_transliterations*2) < $second_fewest_transliterations){
      $counting{low_complexity_alignments_overruled_count}++;
      ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
      $methylation_call_params->{$identifier} = {
        bowtie_sequence      => $three_candidate_seqs[0]->{bowtie_sequence},
        chromosome           => $three_candidate_seqs[0]->{chromosome},
        position             => $three_candidate_seqs[0]->{position},
        index                => $three_candidate_seqs[0]->{index},
        number_of_mismatches => $lowest_mismatch_number,
      };
    }
    else{
      $sequence_fails = 1;
    }
  }
  else{
    $sequence_fails = 1;
  }

  ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
    return 1 if ($unmapped);  # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
    return 0;                 # => exits to next sequence (default)
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the
  ### original bottom strand. We will therefore discard all alignments to strands complementary to the original strands (indices 2
  ### and 3), as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it will have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);

  ### check to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});

  print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## 0 is returned explicitly so that the callers do not print the sequence to unmapped.out
}

### Private helper of check_bowtie_results_single_end: parses the Bowtie 1 output line currently
### stored in $fhs[$index]->{last_line} and records the alignment in the hash referenced by
### $mismatches (number of mismatches -> "chromosome:position" -> alignment details).
### Dies if the chromosome name or the mismatch field cannot be interpreted.
sub _store_bowtie1_single_end_alignment{
  my ($index,$mismatches) = @_;

  ### columns used: read ID, reference name, position, sequence and the mismatch descriptors
  my ($id,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,2,3,4,7];

  $mismatch_info = '' unless ($mismatch_info);
  chomp $mismatch_info;

  ### the reference names of the converted genomes end in _CT_converted or _GA_converted
  my $chromosome;
  if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
    $chromosome = $mapped_chromosome;
  }
  else{
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }

  ### Now extracting the number of mismatches to the converted genome (comma separated entries)
  my $number_of_mismatches;
  if ($mismatch_info eq ''){
    $number_of_mismatches = 0;
  }
  elsif ($mismatch_info =~ /^\d/){
    my @mismatch_entries = split (/,/,$mismatch_info);
    $number_of_mismatches = scalar @mismatch_entries;
  }
  else{
    die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
  }

  ### creating a composite location variable from $chromosome and $position
  my $alignment_location = join (":",$chromosome,$position);

  ### If a sequence aligns to exactly the same location twice with the same number of mismatches (e.g.
  ### because it does not contain any C or G, or because differently converted versions align to the
  ### same position of differently converted genomes) the first entry is kept; overwriting it would
  ### only change the index number stored for the alignment
  unless (exists $mismatches->{$number_of_mismatches}->{$alignment_location}){
    $mismatches->{$number_of_mismatches}->{$alignment_location} = {
      seq_id          => $id,
      bowtie_sequence => $bowtie_sequence,
      index           => $index,
      chromosome      => $chromosome,
      position        => $position,
    };
  }
  return;
}

### Private helper of check_bowtie_results_single_end: reads the next line from the Bowtie
### filehandle of alignment process $index and stores it, together with its read ID (first
### column), in $fhs[$index]. At the end of the Bowtie output last_seq_id and last_line are set
### to undef instead. Returns 1 if a line was read and 0 otherwise.
sub _read_next_bowtie1_single_end_alignment{
  my ($index) = @_;

  my $newline = $fhs[$index]->{fh}->getline();
  if ($newline){
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line} = $newline;
    return 1;
  }

  # assigning undef to last_seq_id and last_line (end of bowtie output)
  $fhs[$index]->{last_seq_id} = undef;
  $fhs[$index]->{last_line} = undef;
  return 0;
}
1314
### Evaluates the Bowtie 2 (SAM) output of all alignment instances stored in @fhs for one single-end read and, if the read
### has a unique best alignment, performs the methylation call and prints the result.
###
### Arguments:
###   $sequence       the read sequence as it appears in the sequence file
###   $identifier     the read ID; it is compared verbatim to the first SAM column of each instance
###   $quality_value  the quality string of the read (optional; FastA reads get 'I', i.e. Phred 40, for every base)
###
### Return value (used by the caller to decide where the read gets written to):
###   0  nothing further to do (read was processed, rejected, or the relevant output option was not specified)
###   1  print the read to the unmapped output (only returned if --un was specified)
###   2  print the read to the ambiguous output (only returned if --ambiguous was specified)
###
### Side effects: advances the filehandles in @fhs past all lines belonging to $identifier and updates %counting.
sub check_bowtie_results_single_end_bowtie2{
  my ($sequence,$identifier,$quality_value) = @_;

  unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
    $quality_value = 'I'x(length$sequence);
  }

  # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
  # The read ID is therefore used as it is (some sequencers don't just have /1 or /2 at the end of read IDs)

  my $alignment_ambiguous = 0; # set to 1 as soon as one instance reports a second best hit with the same alignment score
  my %alignments = ();         # candidate alignments of this read, keyed by 'chromosome:position'

  ### reading from the Bowtie 2 output filehandles
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});

    ### the line currently stored for this instance already belongs to a later read => this instance has nothing for the current read
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    # SAM format specifications for Bowtie 2
    # (1) Name of read that aligned
    # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
    #        1 The read is one of a pair
    #        2 The alignment is one end of a proper paired-end alignment
    #        4 The read has no reported alignments
    #        8 The read is one of a pair and has no reported alignments
    #       16 The alignment is to the reverse reference strand
    #       32 The other mate in the paired-end alignment is aligned to the reverse reference strand
    #       64 The read is mate 1 in a pair
    #      128 The read is mate 2 in a pair
    #      256 The read has multiple mapping states
    # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
    # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
    # (5) Mapping quality (255 means MAPQ is not available)
    # (6) CIGAR string representation of alignment (* if unavailable)
    # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
    # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
    # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
    # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
    # (11) ASCII-encoded read qualities (reversed if the read aligned to the reverse strand), Phred scale, ASCII-offset by 33
    # (12) Optional fields (tab-separated). The ones evaluated here are:
    #        AS:i:<N> Alignment score. Can be negative. Only present if SAM record is for an aligned read.
    #        XS:i:<N> Alignment score for second-best alignment. Only present if more than one alignment was found for the read.
    #        MD:Z:<S> A string representation of the mismatched reference bases in the alignment.
    #      Further optional fields Bowtie 2 may write (YS:i, XN:i, XM:i, XO:i, XG:i, NM:i, YF:Z) are ignored.

    ### the line is split only once; the mandatory columns are picked by position, the optional fields are scanned further down
    my @fields = split (/\t/,$fhs[$index]->{last_line});
    my ($id,$flag,$mapped_chromosome,$position,$cigar,$bowtie_sequence) = @fields[0,1,2,3,5,9];

    ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the
    ### next alignment and move on to the next Bowtie 2 instance
    if ($flag == 4){
      ## reading in the next alignment, which must be the next sequence
      if (_read_next_bowtie2_line_single_end($index) and $fhs[$index]->{last_seq_id} eq $identifier){
        die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
      }
      next; # next instance
    }

    # if there are one or more proper alignments we can extract the chromosome number
    my $chromosome;
    if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
      $chromosome = $mapped_chromosome;
    }
    else{
      die "Chromosome number extraction failed for $mapped_chromosome\n";
    }

    ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
    my ($alignment_score,$second_best,$MD_tag);
    foreach my $optional_field (@fields[11..$#fields]){
      if ($optional_field =~ /AS:i:(.*)/){
        $alignment_score = $1;
      }
      elsif ($optional_field =~ /XS:i:(.*)/){
        $second_best = $1;
      }
      elsif ($optional_field =~ /MD:Z:(.*)/){
        $MD_tag = $1;
      }
    }

    die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);

    ### If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether.
    ### We still have to carry on with the remaining instances so that their filehandles get moved past this read as well
    if (defined $second_best and $alignment_score == $second_best){
      $alignment_ambiguous = 1;
    }
    else{ # either there is no second best hit, or it has a lower alignment score than the current one, so we can safely store the current alignment

      my $alignment_location = join (":",$chromosome,$position);

      ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
      ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
      ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
      ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
      unless (exists $alignments{$alignment_location}){
        $alignments{$alignment_location} = {
                                            seq_id          => $id,
                                            alignment_score => $alignment_score,
                                            bowtie_sequence => $bowtie_sequence,
                                            index           => $index,
                                            chromosome      => $chromosome,
                                            position        => $position,
                                            CIGAR           => $cigar,
                                            MD_tag          => $MD_tag,
                                           };
      }
    }

    if (defined $second_best){
      ### reading and discarding all additional (ambiguous or inferior) alignments of this read until we hit the next sequence
      _skip_remaining_bowtie2_lines_single_end($index,$identifier);
    }
    else{
      ### there was no second best hit, so the next line has to belong to the next sequence
      if (_read_next_bowtie2_line_single_end($index) and $fhs[$index]->{last_seq_id} eq $identifier){
        die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
      }
    }
  }

  ### if the read produced several ambiguous alignments within one instance we can return already now. If --ambiguous or --unmapped was
  ### specified the read sequence will be printed out.
  if ($alignment_ambiguous == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
    return 1 if ($unmapped);  # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
    return 0;
  }

  ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
  unless (%alignments){
    $counting{no_single_alignment_found}++;
    return 1 if ($unmapped);  # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
    return 0;                 # default
  }

  #######################################################################################################################################################

  ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
  ### single best position we are going to store the alignment information in the $methylation_call_params variable. If there are multiple hits with the
  ### same (highest) alignment score we are discarding the sequence altogether.
  ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
  ### opening (5) and extending (3 per bp) the gap.

  #######################################################################################################################################################

  my $methylation_call_params; # hash reference which will store all information we need for the methylation call
  my $sequence_fails = 0;      # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)

  ### every instance can contribute one location at most, so anything outside of 1-4 entries means something went badly wrong
  my $number_of_alignments = scalar keys %alignments;
  unless ($number_of_alignments >= 1 and $number_of_alignments <= 4){
    die "There are too many potential hits for this sequence (1-4 expected, but found: ",$number_of_alignments,")\n";
  }

  ### sorting the locations by descending alignment score; only the best two matter. If there is just a single entry $runner_up_location stays undefined
  my ($best_alignment_location,$runner_up_location) = sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments;

  if (defined $runner_up_location and $alignments{$runner_up_location}->{alignment_score} == $alignments{$best_alignment_location}->{alignment_score}){
    ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
    $sequence_fails = 1;
  }
  else{
    ### storing the unique best alignment for further processing
    foreach my $field (qw(bowtie_sequence chromosome position index alignment_score MD_tag CIGAR)){
      $methylation_call_params->{$identifier}->{$field} = $alignments{$best_alignment_location}->{$field};
    }
  }

  ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    return 2 if ($ambiguous); # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
    return 1 if ($unmapped);  # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
    return 0;                 # => exits to next sequence (default)
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
  if ($directional){
    my $instance_index = $methylation_call_params->{$identifier}->{index};
    if ($instance_index == 2 or $instance_index == 3){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it has a unique best alignment
  $counting{unique_best_alignment_count}++;

  ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
  extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);

  ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
  print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
}

### Helper: reads the next line from the Bowtie 2 filehandle of instance $index and stores it (chomped), together with its
### read ID (first tab-separated column), as last_line/last_seq_id in @fhs. Returns 1 if a line was read. Once the end of the
### Bowtie 2 output has been reached both last_seq_id and last_line are set to undef and 0 is returned.
sub _read_next_bowtie2_line_single_end{
  my ($index) = @_;

  my $newline = $fhs[$index]->{fh}->getline();
  if ($newline){
    chomp $newline;
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $newline;
    return 1;
  }

  # assigning undef to last_seq_id and last_line (end of Bowtie 2 output)
  $fhs[$index]->{last_seq_id} = undef;
  $fhs[$index]->{last_line}   = undef;
  return 0;
}

### Helper: reads and discards lines of instance $index until the stored line belongs to a read other than $identifier,
### or until the end of the Bowtie 2 output has been reached.
sub _skip_remaining_bowtie2_lines_single_end{
  my ($index,$identifier) = @_;

  until ($fhs[$index]->{last_seq_id} ne $identifier){
    last unless (_read_next_bowtie2_line_single_end($index)); # break free in case we have reached the end of the alignment output
  }
}
1690
1691
### Returns the number of bases in $sequence that are affected by the given read conversion:
### the number of Cs for a 'CT' conversion, or the number of Gs for a 'GA' conversion.
### Dies if $read_conversion is neither 'CT' nor 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # tr/// returns the number of characters it replaced. $sequence is a private copy of the argument,
  # so the transliteration never changes the string of the caller
  return scalar ($sequence =~ tr/C/T/) if ($read_conversion eq 'CT');
  return scalar ($sequence =~ tr/G/A/) if ($read_conversion eq 'GA');

  die "Read conversion mode of the read was not specified $!\n";
}
1706
### Decides whether Bowtie 1 instance $index holds an alignment of the read $identifier in a sensible orientation.
###
### Returns 1 if last_line in @fhs contains such an alignment when the sub returns (either the line that was stored already,
### or the second line of the same read which then replaces it). Returns 0 otherwise; in that case last_line/last_seq_id are
### either left untouched, moved on to the next read, or set to undef once the end of the Bowtie output has been reached.
### A read may occur on 2 consecutive lines at most, a third line with the same ID is fatal.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # extracting from Bowtie 1 format
  my ($first_id,$first_strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the sequence stored in @fhs as last_line is already the next sequence to be analysed -> analyse next round
  return 0 unless (($first_id eq $fhs[$index]->{last_seq_id}) and ($first_id eq $identifier));

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them
  ### are theoretically sensible alignments
  my $first_orientation = ensure_sensical_alignment_orientation_single_end ($index,$first_strand);
  return 1 if ($first_orientation == 1); ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($first_orientation == 0);

  ### The stored alignment was in the wrong orientation, so we need to read in a new line
  my $second_line = $fhs[$index]->{fh}->getline();
  unless ($second_line){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  my ($second_id,$second_strand) = (split (/\t/,$second_line))[0,1];

  ### the sequence we just read in is already the next sequence to be analysed -> store it in @fhs
  unless ($second_id eq $identifier){
    $fhs[$index]->{last_seq_id} = $second_id;
    $fhs[$index]->{last_line} = $second_line;
    return 0; # processing the new alignment result only in the next round
  }

  ### the next entry is still the correct sequence, so we are checking the orientation again
  my $second_orientation = ensure_sensical_alignment_orientation_single_end ($index,$second_strand);
  if ($second_orientation == 1){
    $fhs[$index]->{last_seq_id} = $second_id;
    $fhs[$index]->{last_line} = $second_line;
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($second_orientation == 0);

  ### The alignment was in the wrong orientation again, so we need to read in yet another new line and store it in @fhs
  my $third_line = $fhs[$index]->{fh}->getline();
  unless ($third_line){
    # assigning undef to last_seq_id and last_line (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line} = undef;
    return 0; # not processing anything as the alignment currently stored in last_line was in the wrong orientation
  }

  ### the next line must not have the same seq ID again; if it does not we overwrite the current seq-ID and bowtie output with
  ### the same fields of the entry we just read
  my ($third_id) = split (/\t/,$third_line);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($third_id eq $identifier);
  $fhs[$index]->{last_seq_id} = $third_id;
  $fhs[$index]->{last_line} = $third_line;
  return 0; # not processing anything this round as both alignments of this read were in the wrong orientation
}
1784 #########################
1785 ### BOWTIE 1 | PAIRED-END
1786 #########################
1787
1788 sub check_bowtie_results_paired_ends{
1789 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
1790
1791 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
1792 unless ($quality_value_1){
1793 $quality_value_1 = 'I'x(length$sequence_1);
1794 }
1795 unless ($quality_value_2){
1796 $quality_value_2 = 'I'x(length$sequence_2);
1797 }
1798
1799 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
1800
1801 my %mismatches = ();
1802 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
1803
1804
1805 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
1806 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
1807 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
1808 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
1809 ### strands are not being reported by specifying --directional
1810
1811 foreach my $index (0,3,1,2){
1812 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
1813 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
1814 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
1815 if ($fhs[$index]->{last_seq_id} eq $identifier) {
1816 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
1817
1818 ##################################################################################
1819 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
1820 ##################################################################################
1821 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
1822 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong
1823 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
1824 if ($valid_alignment_found == 1){
1825 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
1826 ### we store the useful information in %mismatches
1827 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
1828 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
1829 chomp $mismatch_info_1;
1830 chomp $mismatch_info_2;
1831
1832 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
1833 my ($chromosome_1,$chromosome_2);
1834 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
1835 $chromosome_1 = $mapped_chromosome_1;
1836 }
1837 else{
1838 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
1839 }
1840 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
1841 $chromosome_2 = $mapped_chromosome_2;
1842 }
1843 else{
1844 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
1845 }
1846
1847 ### Now extracting the number of mismatches to the converted genome
1848 my $number_of_mismatches_1;
1849 my $number_of_mismatches_2;
1850 if ($mismatch_info_1 eq ''){
1851 $number_of_mismatches_1 = 0;
1852 }
1853 elsif ($mismatch_info_1 =~ /^\d/){
1854 my @mismatches = split (/,/,$mismatch_info_1);
1855 $number_of_mismatches_1 = scalar @mismatches;
1856 }
1857 else{
1858 die "Something weird is going on with the mismatch field\n";
1859 }
1860 if ($mismatch_info_2 eq ''){
1861 $number_of_mismatches_2 = 0;
1862 }
1863 elsif ($mismatch_info_2 =~ /^\d/){
1864 my @mismatches = split (/,/,$mismatch_info_2);
1865 $number_of_mismatches_2 = scalar @mismatches;
1866 }
1867 else{
1868 die "Something weird is going on with the mismatch field\n";
1869 }
1870 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
1871 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
1872 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1873 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
1874 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
1875 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
1876 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1877 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
1878 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
1879 ### number for the found alignment)
1880 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
1881 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
1882 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
1883 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
1884 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
1885 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
1886 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
1887 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
1888 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
1889 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
1890 }
1891 ###################################################################################################################################################
1892 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
1893 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2. ###
1894 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
1895 ### this round ###
1896 ###################################################################################################################################################
1897 my $newline_1 = $fhs[$index]->{fh}-> getline();
1898 my $newline_2 = $fhs[$index]->{fh}-> getline();
1899
1900 if ($newline_1 and $newline_2){
1901 my ($seq_id_1) = split (/\t/,$newline_1);
1902 my ($seq_id_2) = split (/\t/,$newline_2);
1903
1904 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
1905 $fhs[$index]->{last_seq_id} = $seq_id_1;
1906 }
1907 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
1908 $fhs[$index]->{last_seq_id} = $seq_id_2;
1909 }
1910 else{
1911 die "Either read 1 or read 2 needs to end on '/1'\n";
1912 }
1913
1914 $fhs[$index]->{last_line_1} = $newline_1;
1915 $fhs[$index]->{last_line_2} = $newline_2;
1916 }
1917 else {
1918 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
1919 $fhs[$index]->{last_seq_id} = undef;
1920 $fhs[$index]->{last_line_1} = undef;
1921 $fhs[$index]->{last_line_2} = undef;
1922 next; # jumping to the next index
1923 }
1924 ### Now processing the entry we just stored in last_line_1 and last_line_2
1925 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
1926 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
1927 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
1928 if ($valid_alignment_found == 1){
1929 ### we store the useful information in %mismatches
1930 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
1931 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
1932 chomp $mismatch_info_1;
1933 chomp $mismatch_info_2;
1934 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
1935 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
1936 $chromosome_1 = $mapped_chromosome_1;
1937 }
1938 else{
1939 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
1940 }
1941 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
1942 $chromosome_2 = $mapped_chromosome_2;
1943 }
1944 else{
1945 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
1946 }
1947
1948 $number_of_mismatches_1='';
1949 $number_of_mismatches_2='';
1950 ### Now extracting the number of mismatches to the converted genome
1951 if ($mismatch_info_1 eq ''){
1952 $number_of_mismatches_1 = 0;
1953 }
1954 elsif ($mismatch_info_1 =~ /^\d/){
1955 my @mismatches = split (/,/,$mismatch_info_1);
1956 $number_of_mismatches_1 = scalar @mismatches;
1957 }
1958 else{
1959 die "Something weird is going on with the mismatch field\n";
1960 }
1961 if ($mismatch_info_2 eq ''){
1962 $number_of_mismatches_2 = 0;
1963 }
1964 elsif ($mismatch_info_2 =~ /^\d/){
1965 my @mismatches = split (/,/,$mismatch_info_2);
1966 $number_of_mismatches_2 = scalar @mismatches;
1967 }
1968 else{
1969 die "Something weird is going on with the mismatch field\n";
1970 }
1971 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
1972 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
1973 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
1974 die "position 1 is greater than position 2" if ($position_1 > $position_2);
1975 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
1976 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
1977 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
1978 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
1979 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
1980 ### number for the found alignment)
1981 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
1982 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
1983 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
1984 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
1985 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
1986 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
1987 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
1988 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
1989 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
1990 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
1991 }
1992 ###############################################################################################################################################
1993 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2 ###
1994 ###############################################################################################################################################
1995 $newline_1 = $fhs[$index]->{fh}-> getline();
1996 $newline_2 = $fhs[$index]->{fh}-> getline();
1997
1998 if ($newline_1 and $newline_2){
1999 my ($seq_id_1) = split (/\t/,$newline_1);
2000 my ($seq_id_2) = split (/\t/,$newline_2);
2001
2002 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
2003 $fhs[$index]->{last_seq_id} = $seq_id_1;
2004 }
2005 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
2006 $fhs[$index]->{last_seq_id} = $seq_id_2;
2007 }
2008 $fhs[$index]->{last_line_1} = $newline_1;
2009 $fhs[$index]->{last_line_2} = $newline_2;
2010 }
2011 else {
2012 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
2013 $fhs[$index]->{last_seq_id} = undef;
2014 $fhs[$index]->{last_line_1} = undef;
2015 $fhs[$index]->{last_line_2} = undef;
2016 next; # jumping to the next index
2017 }
2018 ### within the 2nd sequence pair alignment in correct orientation found
2019 }
2020 ### within the 1st sequence pair alignment in correct orientation found
2021 }
2022 ### still within the (last_seq_id eq identifier) condition
2023 }
2024 ### still within foreach index loop
2025 }
2026 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
2027 unless(%mismatches){
2028 $counting{no_single_alignment_found}++;
2029 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
2030 }
2031 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2032 my $sequence_pair_fails = 0;
2033 ### Declaring an empty hash reference which will store all information we need for the methylation call
2034 my $methylation_call_params; # hash reference!
2035 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
2036 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
2037 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
2038 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
2039 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
2040 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
2041 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
2042 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
2043 }
2044 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
2045 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
2046 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
2047 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
2048 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
2049 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
2050 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
2051 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
2052 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
2053 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
2054 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
2055 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
2056 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
2057 }
2058 }
2059 else{
2060 $sequence_pair_fails = 1;
2061 }
2062 ### after processing the alignment with the lowest number of mismatches we exit
2063 last;
2064 }
2065 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
2066 if ($sequence_pair_fails == 1){
2067 $counting{unsuitable_sequence_count}++;
2068 if ($ambiguous){
2069 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
2070 }
2071 if ($unmapped){
2072 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
2073 }
2074 else{
2075 return 0; # => exits to next sequence (default)
2076 }
2077 }
2078
2079 ### --DIRECTIONAL
2080 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2081 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2082 if ($directional){
2083 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2084 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2085 $counting{alignments_rejected_count}++;
2086 return 0;
2087 }
2088 }
2089
2090 ### If the sequence has not been rejected so far it does have a unique best alignment
2091 $counting{unique_best_alignment_count}++;
2092 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
2093
2094 ### sanity check to see whether the genomic sequences we extracted have the same length as the observed sequences +2; only then do we perform the methylation call
2095 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2096 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2097 $counting{genomic_sequence_could_not_be_extracted_count}++;
2098 return 0;
2099 }
2100 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2101 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2102 $counting{genomic_sequence_could_not_be_extracted_count}++;
2103 return 0;
2104 }
2105
2106 ### otherwise we are set to perform the actual methylation call
2107 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2108 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2109
2110 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2111 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2112 }
2113
2114 #########################
2115 ### BOWTIE 2 | PAIRED-END
2116 #########################
2117
2118 sub check_bowtie_results_paired_ends_bowtie2{
2119 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
2120
2121 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
2122 unless ($quality_value_1){
2123 $quality_value_1 = 'I'x(length$sequence_1);
2124 }
2125
2126 unless ($quality_value_2){
2127 $quality_value_2 = 'I'x(length$sequence_2);
2128 }
2129
2130
2131 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
2132
2133
2134 my %alignments;
2135 my $alignment_ambiguous = 0;
2136
2137 ### reading from the Bowtie 2 output filehandles
2138
2139 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
2140 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
2141 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
2142 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
2143 ### strands are not being reported when '--directional' is specified
2144
2145 foreach my $index (0,3,1,2){
2146 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
2147 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
2148
2149 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
2150 if ($fhs[$index]->{last_seq_id} eq $identifier) {
2151
2152 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
2153 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
2154 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
2155 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
2156 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
2157 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
2158 $id_1 =~ s/\/1$//;
2159 $id_2 =~ s/\/2$//;
2160
2161 # SAM format specifications for Bowtie 2
2162 # (1) Name of read that aligned
2163 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
2164 # 1 The read is one of a pair
2165 # 2 The alignment is one end of a proper paired-end alignment
2166 # 4 The read has no reported alignments
2167 # 8 The read is one of a pair and has no reported alignments
2168 # 16 The alignment is to the reverse reference strand
2169 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
2170 # 64 The read is mate 1 in a pair
2171 # 128 The read is mate 2 in a pair
2172 # 256 The read has multiple mapping states
2173 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
2174 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
2175 # (5) Mapping quality (255 means MAPQ is not available)
2176 # (6) CIGAR string representation of alignment (* if unavailable)
2177 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
2178 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
2179 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
2180 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
2181 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
2182 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
2183 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
2184 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
2185 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
2186 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
2187 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
2188 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2189 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
2190 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
2191 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
2192 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
2193
2194 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
2195 ### We can store the next alignment and move on to the next Bowtie 2 instance
2196 if ($flag_1 == 77 and $flag_2 == 141){
2197 ## reading in the next alignment, which must be the next sequence
2198 my $newline_1 = $fhs[$index]->{fh}-> getline();
2199 my $newline_2 = $fhs[$index]->{fh}-> getline();
2200
2201 if ($newline_1 and $newline_2){
2202 chomp $newline_1;
2203 chomp $newline_2;
2204 my ($seq_id_1) = split (/\t/,$newline_1);
2205 my ($seq_id_2) = split (/\t/,$newline_2);
2206 $seq_id_1 =~ s/\/1$//;
2207 $seq_id_2 =~ s/\/2$//;
2208 $fhs[$index]->{last_seq_id} = $seq_id_1;
2209 $fhs[$index]->{last_line_1} = $newline_1;
2210 $fhs[$index]->{last_line_2} = $newline_2;
2211
2212 # print "current sequence ($identifier) did not map, reading in next sequence\n";
2213 # print "$index\t$fhs[$index]->{last_seq_id}\n";
2214 # print "$index\t$fhs[$index]->{last_line_1}\n";
2215 # print "$index\t$fhs[$index]->{last_line_2}\n";
2216 next; # next instance
2217 }
2218 else{
2219 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2220 $fhs[$index]->{last_seq_id} = undef;
2221 $fhs[$index]->{last_line_1} = undef;
2222 $fhs[$index]->{last_line_2} = undef;
2223 next;
2224 }
2225 }
2226
2227 ### If there are one or more proper alignments we can extract the chromosome number
2228 my ($chromosome_1,$chromosome_2);
2229 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
2230 $chromosome_1 = $mapped_chromosome_1;
2231 }
2232 else{
2233 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
2234 }
2235 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
2236 $chromosome_2 = $mapped_chromosome_2;
2237 }
2238 else{
2239 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
2240 }
2241
2242 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
2243
2244 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
2245 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
2246
2247 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
2248 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
2249
2250 foreach (11..$#fields_1){
2251 if ($fields_1[$_] =~ /AS:i:(.*)/){
2252 $alignment_score_1 = $1;
2253 }
2254 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
2255 $second_best_1 = $1;
2256 }
2257 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
2258 $MD_tag_1 = $1;
2259 }
2260 }
2261
2262 foreach (11..$#fields_2){
2263 if ($fields_2[$_] =~ /AS:i:(.*)/){
2264 $alignment_score_2 = $1;
2265 }
2266 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
2267 $second_best_2 = $1;
2268 }
2269 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
2270 $MD_tag_2 = $1;
2271 }
2272 }
2273
2274 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
2275 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
2276
2277 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
2278 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
2279 # warn "MD tag 1 is: '$MD_tag_1'\n";
2280 # warn "MD tag 2 is: '$MD_tag_2'\n";
2281
2282 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
2283 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
2284 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
2285
2286 if (defined $second_best_1 and defined $second_best_2){
2287 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
2288 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
2289 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
2290 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
2291
2292 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
2293 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
2294 $alignment_ambiguous = 1;
2295 # print "This read will be chucked (AS==XS detected)!\n";
2296
2297 ## need to read and discard all additional ambiguous reads until we reach the next sequence
2298 until ($fhs[$index]->{last_seq_id} ne $identifier){
2299 my $newline_1 = $fhs[$index]->{fh}-> getline();
2300 my $newline_2 = $fhs[$index]->{fh}-> getline();
2301 if ($newline_1 and $newline_2){
2302 chomp $newline_1;
2303 chomp $newline_2;
2304 my ($seq_id_1) = split (/\t/,$newline_1);
2305 my ($seq_id_2) = split (/\t/,$newline_2);
2306 $seq_id_1 =~ s/\/1$//;
2307 $seq_id_2 =~ s/\/2$//;
2308 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2309
2310 $fhs[$index]->{last_seq_id} = $seq_id_1;
2311 $fhs[$index]->{last_line_1} = $newline_1;
2312 $fhs[$index]->{last_line_2} = $newline_2;
2313 }
2314 else{
2315 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
2316 $fhs[$index]->{last_seq_id} = undef;
2317 $fhs[$index]->{last_line_1} = undef;
2318 $fhs[$index]->{last_line_2} = undef;
2319 last; # break free if the end of the alignment output was reached
2320 }
2321 }
2322 # if ($fhs[$index]->{last_seq_id}){
2323 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
2324 # }
2325 }
2326 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
2327
2328 my $alignment_location;
2329 if ($position_1 <= $position_2){
2330 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2331 }
2332 elsif($position_2 < $position_1){
2333 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
2334 }
2335
2336 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2337 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2338 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2339 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2340
2341 unless (exists $alignments{$alignment_location}){
2342 $alignments{$alignment_location}->{seq_id} = $id_1;
2343 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2344 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2345 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2346 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2347 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2348 $alignments{$alignment_location}->{index} = $index;
2349 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2350 $alignments{$alignment_location}->{position_1} = $position_1;
2351 $alignments{$alignment_location}->{position_2} = $position_2;
2352 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2353 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2354 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2355 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2356 $alignments{$alignment_location}->{flag_1} = $flag_1;
2357 $alignments{$alignment_location}->{flag_2} = $flag_2;
2358 }
2359 # warn "added best of several alignments to \%alignments hash\n";
2360
2361 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
2362 until ($fhs[$index]->{last_seq_id} ne $identifier){
2363 my $newline_1 = $fhs[$index]->{fh}-> getline();
2364 my $newline_2 = $fhs[$index]->{fh}-> getline();
2365 if ($newline_1 and $newline_2){
2366 chomp $newline_1;
2367 chomp $newline_2;
2368 my ($seq_id_1) = split (/\t/,$newline_1);
2369 my ($seq_id_2) = split (/\t/,$newline_2);
2370 $seq_id_1 =~ s/\/1$//;
2371 $seq_id_2 =~ s/\/2$//;
2372 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2373
2374 $fhs[$index]->{last_seq_id} = $seq_id_1;
2375 $fhs[$index]->{last_line_1} = $newline_1;
2376 $fhs[$index]->{last_line_2} = $newline_2;
2377 }
2378 else{
2379 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2380 $fhs[$index]->{last_seq_id} = undef;
2381 $fhs[$index]->{last_line_1} = undef;
2382 $fhs[$index]->{last_line_2} = undef;
2383 last; # break free if the end of the alignment output was reached
2384 }
2385 }
2386 # if($fhs[$index]->{last_seq_id}){
2387 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
2388 # }
2389 }
2390 }
2391 else{ # there is no second best hit, so we can just store this one and read in the next sequence
2392
2393 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
2394 # print "$alignment_location\n";
2395 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
2396 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
2397 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
2398 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
2399
2400 unless (exists $alignments{$alignment_location}){
2401 $alignments{$alignment_location}->{seq_id} = $id_1;
2402 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
2403 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
2404 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
2405 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
2406 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
2407 $alignments{$alignment_location}->{index} = $index;
2408 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
2409 $alignments{$alignment_location}->{position_1} = $position_1;
2410 $alignments{$alignment_location}->{position_2} = $position_2;
2411 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
2412 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
2413 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
2414 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
2415 $alignments{$alignment_location}->{flag_1} = $flag_1;
2416 $alignments{$alignment_location}->{flag_2} = $flag_2;
2417 }
2418
2419 # warn "added unique alignment to \%alignments hash\n";
2420
2421 # Now reading and storing the next read pair
2422 my $newline_1 = $fhs[$index]->{fh}-> getline();
2423 my $newline_2 = $fhs[$index]->{fh}-> getline();
2424 if ($newline_1 and $newline_2){
2425 chomp $newline_1;
2426 chomp $newline_2;
2427 # print "$newline_1\n";
2428 # print "$newline_2\n";
2429 my ($seq_id_1) = split (/\t/,$newline_1);
2430 my ($seq_id_2) = split (/\t/,$newline_2);
2431 $seq_id_1 =~ s/\/1$//;
2432 $seq_id_2 =~ s/\/2$//;
2433 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
2434
2435 $fhs[$index]->{last_seq_id} = $seq_id_1;
2436 $fhs[$index]->{last_line_1} = $newline_1;
2437 $fhs[$index]->{last_line_2} = $newline_2;
2438
2439 if ($seq_id_1 eq $identifier){
2440 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
2441 }
2442 }
2443 else{
2444 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
2445 $fhs[$index]->{last_seq_id} = undef;
2446 $fhs[$index]->{last_line_1} = undef;
2447 $fhs[$index]->{last_line_2} = undef;
2448 }
2449 }
2450 }
2451 }
2452
2453 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
2454 if ($alignment_ambiguous == 1){
2455 $counting{unsuitable_sequence_count}++;
2456 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2457 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2458 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2459 # print "$ambiguous_read_1\n";
2460 # print "$ambiguous_read_2\n";
2461
2462 if ($ambiguous){
2463 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2464 }
2465 elsif ($unmapped){
2466 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2467 }
2468 else{
2469 return 0;
2470 }
2471 }
2472
2473 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
2474 unless (%alignments){
2475 $counting{no_single_alignment_found}++;
2476
2477 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2478 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2479 # print "$unmapped_read_1\n";
2480 # print "$unmapped_read_2\n";
2481 if ($unmapped){
2482 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
2483 }
2484 else{
2485 return 0;
2486 }
2487 }
2488
2489 #######################################################################################################################################################
2490
2491 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
2492 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
2493 ### alignment score we are discarding the sequence pair altogether.
2494 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
2495 ### and extending (3 per bp) the gap.
2496
2497 #######################################################################################################################################################
2498
2499 ### Declaring an empty hash reference which will store all information we need for the methylation call
2500 my $methylation_call_params; # hash reference
2501 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
2502
2503 ### print contents of %alignments for debugging
2504 ## if (scalar keys %alignments >= 1){
2505 # print "\n******\n";
2506 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
2507 # print "Loc: $alignment_location\n";
2508 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
2509 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
2510 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
2511 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
2512 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
2513 # print "Index $alignments{$alignment_location}->{index}\n";
2514 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
2515 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
2516 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
2517 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
2518 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
2519 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
2520 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
2521 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
2522 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
2523 # }
2524 # print "\n******\n";
2525 # }
2526
2527 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
2528 if (scalar keys %alignments == 1){
2529 for my $unique_best_alignment (keys %alignments){
2530 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
2531 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
2532 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
2533 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
2534 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
2535 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
2536 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
2537 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
2538 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
2539 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
2540 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
2541 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
2542 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
2543 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
2544 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
2545 }
2546 }
2547
2548 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
2549 ### we boot the sequence pair altogether)
2550 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
2551 my $best_sum_of_alignment_scores;
2552 my $best_alignment_location;
2553 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
2554 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
2555 unless (defined $best_sum_of_alignment_scores){
2556 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
2557 $best_alignment_location = $alignment_location;
2558 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
2559 }
2560 else{
2561 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
2562 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
2563 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
2564 $sequence_pair_fails = 1;
2565 last; # exiting since we know that the sequence has ambiguous alignments
2566 }
2567 ### else we are going to store the best alignment for further processing
2568 else{
2569 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
2570 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
2571 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
2572 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
2573 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
2574 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
2575 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
2576 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
2577 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
2578 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
2579 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
2580 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
2581 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
2582 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
2583 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
2584 last; # exiting since the sequence produced a unique best alignment
2585 }
2586 }
2587 }
2588 }
2589 else{
2590 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
2591 }
2592
2593 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
2594 if ($sequence_pair_fails == 1){
2595 $counting{unsuitable_sequence_count}++;
2596
2597 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
2598 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
2599 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
2600 # print "$ambiguous_read_1\n";
2601 # print "$ambiguous_read_2\n";
2602
2603 if ($ambiguous){
2604 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
2605 }
2606 elsif ($unmapped){
2607 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
2608 }
2609 else{
2610 return 0; # => exits to next sequence pair (default)
2611 }
2612 }
2613
2614 ### --DIRECTIONAL
2615 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
2616 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
2617 if ($directional){
2618 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
2619 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
2620 $counting{alignments_rejected_count}++;
2621 return 0;
2622 }
2623 }
2624
2625 ### If the sequence pair has not been rejected so far it does have a unique best alignment
2626 $counting{unique_best_alignment_count}++;
2627 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
2628
2629 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call
2630 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
2631 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
2632 $counting{genomic_sequence_could_not_be_extracted_count}++;
2633 return 0;
2634 }
2635 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
2636 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
2637 $counting{genomic_sequence_could_not_be_extracted_count}++;
2638 return 0;
2639 }
2640
2641 ### now we are set to perform the actual methylation call
2642 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
2643 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
2644 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
2645 # print " $sequence_2\n";
2646 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
2647 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
2648
2649 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
2650 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
2651 }
2652
2653 ###
2654
sub decide_whether_paired_end_alignment_is_valid{
  ### Decides whether the alignment pair currently buffered for Bowtie instance $index
  ### ($fhs[$index]->{last_line_1} and ->{last_line_2}) is a usable alignment of the read pair $identifier.
  ###
  ### Arguments:
  ###   $index      - position of the Bowtie instance in @fhs
  ###   $identifier - ID of the read pair currently being processed (compared against the buffered IDs after a trailing /1 was removed)
  ###
  ### Returns 1 if the buffered pair belongs to $identifier and ensure_sensical_alignment_orientation_paired_ends() accepted its
  ### orientation, and 0 in every other case.
  ###
  ### Side effects: if the buffered pair is in the wrong orientation, up to two further line pairs are read from $fhs[$index]->{fh},
  ### and last_seq_id/last_line_1/last_line_2 are replaced by the pair which has to be looked at next (or by undef once the
  ### alignment output is exhausted). Lines read here are stored exactly as they were read, i.e. without being chomped.
  ###
  ### Dies if the orientation check returns something other than 0 or 1, if neither read of a freshly read pair ends on /1, or
  ### if the same read ID turns up in three consecutive line pairs.
  my ($index,$identifier) = @_;

  ### nothing is buffered for this instance (end of the alignment output was reached earlier), so there is nothing to validate
  return 0 unless (defined $fhs[$index]->{last_line_1} and defined $fhs[$index]->{last_line_2});

  ### stores the line pair that will be looked at next; calling it with undef values marks the end of the alignment output
  my $remember_pair = sub {
    my ($seq_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
    return;
  };

  ### Takes the IDs of both lines and removes the /1 tag from the first one that carries it. Returns the ID of read 1 without
  ### the tag, followed by both IDs in their possibly modified form
  my $strip_read_1_tag = sub {
    my ($first_id,$second_id) = @_;
    my $read_1_id;
    if ($first_id =~ s/\/1$//){
      $read_1_id = $first_id;
    }
    elsif ($second_id =~ s/\/1$//){
      $read_1_id = $second_id;
    }
    else{
      die "One of the two reads needs to end on /1!!";
    }
    return ($read_1_id,$first_id,$second_id);
  };

  ### only the read ID and the strand are needed here, the remaining fields are processed elsewhere
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  ### either of the two lines may hold read 1, so the /1 tag is removed from both IDs before comparing
  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//;
  $seq_id_2 =~ s/\/1$//;

  ### the buffered pair is already the next sequence pair to be analysed -> analyse next round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them
  ### are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1);   ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $remember_pair->(undef,undef,undef);   # end of bowtie output
    return 0;                              # the buffered alignment was in the wrong orientation
  }

  ### extract the read IDs and strands again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  my $seqid;
  ($seqid,$seq_id_1,$seq_id_2) = $strip_read_1_tag->($id_1,$id_2);

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it and process it next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $remember_pair->($seqid,$newline_1,$newline_2);
    return 0;
  }

  ### the new pair still belongs to $identifier, checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    $remember_pair->($seqid,$newline_1,$newline_2);
    return 1;   ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation again, so we read in yet another 2 lines (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $remember_pair->(undef,undef,undef);   # end of bowtie output
    return 0;
  }

  ($seqid) = $strip_read_1_tag->( (split (/\t/,$newline_1))[0], (split (/\t/,$newline_2))[0] );

  ### the next 2 lines must not have the same seq ID again; if they don't we buffer them for the next round
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $remember_pair->($seqid,$newline_1,$newline_2);
  return 0;   # not processing anything this round as both alignments of $identifier were in the wrong orientation
}
2775
2776 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
2777
sub extract_corresponding_genomic_sequence_paired_ends {
  ### Extracts the genomic sequence underneath both reads of a paired-end alignment (Bowtie 1) and records which
  ### conversion is expected for each read.
  ###
  ### Arguments:
  ###   $sequence_identifier     - key of the read pair within $methylation_call_params
  ###   $methylation_call_params - hash reference; the entry for $sequence_identifier needs to provide index, chromosome,
  ###                              start_seq_1, start_seq_2, bowtie_sequence_1 and bowtie_sequence_2
  ###
  ### The following keys are written to the same entry: alignment_read_1, alignment_read_2, genome_conversion,
  ### read_conversion_1, read_conversion_2, unmodified_genomic_sequence_1 and unmodified_genomic_sequence_2.
  ### One of the four strand counters in %counting is incremented. Dies if index is not 0, 1, 2 or 3.
  ###
  ### Each genomic sequence is 2 bases longer than the aligned sequence so that a CpG, CHG or CHH methylation call can
  ### also be made if the C happens to be at the first or last position of the observed sequence. If such a window does
  ### not fit entirely onto the chromosome an empty string is stored instead. (Previously only one of the two windows per
  ### index was checked; the other one could be truncated, or - for a negative offset - be taken from the end of the
  ### chromosome, because substr counts negative offsets from the end of the string.)
  my ($sequence_identifier,$methylation_call_params) = @_;
  my $params = ($methylation_call_params->{$sequence_identifier} ||= {});

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also
  ### giving the sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my ($strand_counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion);

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($params->{index} == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    ($strand_counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion)
      = ('CT_GA_CT_count','+','-','CT','GA','CT');
  }
  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($params->{index} == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    ($strand_counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion)
      = ('GA_CT_GA_count','+','-','GA','CT','GA');
  }
  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($params->{index} == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    ($strand_counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion)
      = ('GA_CT_CT_count','-','+','GA','CT','CT');
  }
  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($params->{index} == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    ($strand_counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion)
      = ('CT_GA_GA_count','-','+','CT','GA','GA');
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$strand_counter}++;

  ### For alignments against the CT converted genome (index 0 and 2) the 2 extra bases are taken after the aligned sequence,
  ### for alignments against the GA converted genome (index 1 and 3) they are taken in front of it
  my $chromosome          = $params->{chromosome};
  my $extra_bases_upfront = ($genome_conversion eq 'GA') ? 2 : 0;

  ### Returns the genomic sequence for an alignment starting at $start plus the 2 extra bases, or '' if this window does
  ### not lie entirely within the chromosome. The chromosome is always accessed via %chromosomes so that it is never copied
  my $genomic_window = sub {
    my ($start,$aligned_sequence) = @_;
    my $offset = $start - $extra_bases_upfront;
    my $window = length($aligned_sequence) + 2;
    return '' unless (defined $chromosomes{$chromosome});
    return '' if ($offset < 0 or $offset + $window > length($chromosomes{$chromosome}));
    return substr ($chromosomes{$chromosome},$offset,$window);
  };

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether
  ### read 1 or read 2 was the + alignment, so start_seq_1/bowtie_sequence_1 always describe the forward hit and
  ### start_seq_2/bowtie_sequence_2 the hit on the reverse strand
  my $forward_hit = $genomic_window->($params->{start_seq_1},$params->{bowtie_sequence_1});
  my $reverse_hit = $genomic_window->($params->{start_seq_2},$params->{bowtie_sequence_2});

  ### the reverse strand sequence needs to be reverse complemented
  $reverse_hit = reverse_complement($reverse_hit) if (length $reverse_hit);

  ### We always read in sequences read 1 then read 2, so if read 2 is the + alignment (index 2 and 3) we need to swap the
  ### extracted genomic sequences around: non_bisulfite_sequence_1 has to correspond to read 1 and _2 to read 2
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2) = ($alignment_read_1 eq '+')
    ? ($forward_hit,$reverse_hit)
    : ($reverse_hit,$forward_hit);

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
2918
2919 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
2920
### Works out the unconverted genomic sequence for both mates of a Bowtie 2 paired-end alignment and records them,
### together with the strand and conversion 'memory' needed for the methylation call, in the read pair's entry
### of $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read pair within $methylation_call_params
###   $methylation_call_params - hash reference; for the read pair the entries chromosome, index, CIGAR_1, CIGAR_2,
###                              position_1 and position_2 are read
###
### Each genomic sequence carries 2 extra bases so that a CpG, CHG or CHH context call is also possible for a C at
### the very edge of the read: for index 1 and 3 they are taken upstream of the alignment, for index 0 and 2
### downstream of it. Read insertions are padded with Ns, deletions only advance the genomic position.
###
### On success these entries of the read pair are set: alignment_read_1, alignment_read_2, genome_conversion,
### read_conversion_1, read_conversion_2, unmodified_genomic_sequence_1, unmodified_genomic_sequence_2,
### end_position_1, end_position_2 (0-based start plus the genomic length covered by M and D operations),
### indels_1 and indels_2 (summed lengths of I and D operations).
###
### If the 2 extra bases of a mate would lie beyond a chromosome edge the sub returns early: only the
### unmodified_genomic_sequence entry of that mate is written (holding whatever was assembled up to that point, which
### is empty for the upstream case but NOT for the downstream case) and none of the other entries are set.
### NOTE(review): callers presumably detect this through the missing entry of the other mate - confirm.
###
### Dies if a CIGAR string is malformed or contains operations other than M, I and D, or if index is not 0-3.
### Relies on the file-level %chromosomes and %counting hashes and on reverse_complement().
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  # making sure the per-read-pair hash exists so that the results can be stored in it
  $methylation_call_params->{$sequence_identifier} = {} unless (defined $methylation_call_params->{$sequence_identifier});
  my $read_pair = $methylation_call_params->{$sequence_identifier};

  my $chromosome = $read_pair->{chromosome};
  my $index      = $read_pair->{index};
  my $cigar_1    = $read_pair->{CIGAR_1};
  my $cigar_2    = $read_pair->{CIGAR_2};

  ### Positions in SAM format are 1-based, so we need to subtract 1 when getting substrings
  my $pos_1 = $read_pair->{position_1}-1;
  my $pos_2 = $read_pair->{position_2}-1;

  # both CIGAR strings are split up (and their element counts checked) before any sequence is extracted
  my ($len_1,$ops_1) = _split_bowtie2_cigar($cigar_1,'CIGAR 1');
  my ($len_2,$ops_2) = _split_bowtie2_cigar($cigar_2,'CIGAR 2');

  # index 1 and 3 (alignments to the GA converted genome) need the 2 extra bases upstream of the alignment,
  # index 0 and 2 (alignments to the CT converted genome) need them downstream
  my $extend_upstream   = ( ($index == 1) || ($index == 3) );
  my $extend_downstream = ( ($index == 0) || ($index == 2) );

  ### Extracting read 1 genomic sequence ###
  my ($complete_1,$non_bisulfite_sequence_1,$end_1,$indels_1) = _bowtie2_mate_genomic_sequence($chromosome,$pos_1,$len_1,$ops_1,$cigar_1,'CIGAR 1',$extend_upstream,$extend_downstream);
  unless ($complete_1){ # right at the edge of a chromosome
    $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    return;
  }

  ### Extracting read 2 genomic sequence ###
  my ($complete_2,$non_bisulfite_sequence_2,$end_2,$indels_2) = _bowtie2_mate_genomic_sequence($chromosome,$pos_2,$len_2,$ops_2,$cigar_2,'CIGAR 2',$extend_upstream,$extend_downstream);
  unless ($complete_2){ # right at the edge of a chromosome
    $read_pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
    return;
  }

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether
  ### read 1 or read 2 was the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion);

  if ($index == 0){
    ### CT converted read 1 plus GA converted read 2 vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT_GA_CT_count','+','-','CT','GA','CT');
  }
  elsif ($index == 1){
    ### GA converted read 1 plus CT converted read 2 vs. GA converted genome [sequence originated from complementary to (converted) bottom strand]
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA_CT_GA_count','+','-','GA','CT','GA');
  }
  elsif ($index == 2){
    ### GA converted read 1 plus CT converted read 2 vs. CT converted genome [sequence originated from the complementary to (converted) top strand]
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('GA_CT_CT_count','-','+','GA','CT','CT');
  }
  elsif ($index == 3){
    ### CT converted read 1 plus GA converted read 2 vs. GA converted genome [sequence originated from the (converted) reverse strand]
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('CT_GA_GA_count','-','+','CT','GA','GA');
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  ### the genomic sequence of whichever mate aligned in - orientation needs to be reverse complemented
  if ($alignment_read_1 eq '-'){
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  $read_pair->{alignment_read_1}              = $alignment_read_1;
  $read_pair->{alignment_read_2}              = $alignment_read_2;
  $read_pair->{genome_conversion}             = $genome_conversion;
  $read_pair->{read_conversion_1}             = $read_conversion_info_1;
  $read_pair->{read_conversion_2}             = $read_conversion_info_2;
  $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $read_pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## the end position of a read is the position reached after walking through its CIGAR string
  $read_pair->{end_position_1}                = $end_1;
  $read_pair->{end_position_2}                = $end_2;
  ## the indels are later added to the Hamming distance value (needed for the NM field in the final SAM output)
  $read_pair->{indels_1}                      = $indels_1;
  $read_pair->{indels_2}                      = $indels_2;
}

### Splits a CIGAR string into its lengths and its operations and returns them as two array references.
### $label ('CIGAR 1' or 'CIGAR 2') is only used for the error message. Dies if the two counts differ.
sub _split_bowtie2_cigar{
  my ($cigar,$label) = @_;

  my @lengths    = split (/\D+/,$cigar); # storing the length per operation
  my @operations = split (/\d+/,$cigar); # storing the operation
  shift @operations;                     # remove the empty first element
  die "$label string contained a non-matching number of lengths and operations\n" unless (scalar @lengths == scalar @operations);

  return (\@lengths,\@operations);
}

### Assembles the genomic sequence covered by one mate, including the 2 extra context bases.
### Returns the list ($complete,$sequence,$end_position,$indels). $complete is false if the 2 extra bases would lie
### beyond a chromosome edge; $sequence then only holds what had been assembled up to that point.
sub _bowtie2_mate_genomic_sequence{
  my ($chromosome,$pos,$lengths,$operations,$cigar,$label,$extend_upstream,$extend_downstream) = @_;

  my $sequence = '';
  my $indels   = 0;

  ### 2 additional bp at the 5' end
  if ($extend_upstream){
    # a 0-based start of exactly 2 still leaves 2 valid upstream bases (offsets 0 and 1), hence >= and not >
    return (0,$sequence,$pos,$indels) unless ($pos-2 >= 0);
    $sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  foreach my $element (0..$#{$lengths}){
    my $length    = $lengths->[$element];
    my $operation = $operations->[$element];

    if ($operation eq 'M'){
      # extracting genomic sequence and adjusting the position
      $sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){ # insertion in the read sequence
      # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
      # the position doesn't need adjusting
      $sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($operation eq 'D'){ # deletion in the read sequence
      # we do not add any genomic sequence but only adjust the position
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
      die "The $label string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The $label string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### 2 additional bp at the 3' end
  if ($extend_downstream){
    return (0,$sequence,$pos,$indels) unless (length($chromosomes{$chromosome}) >= $pos+2);
    $sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  return (1,$sequence,$pos,$indels);
}
3144
3145 ##########################################
3146 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
3147 ##########################################
3148
### Writes one uniquely mapped single-end Bowtie 1 alignment and its methylation call to the output.
### Arguments: read identifier, read sequence, hash reference holding the alignment details of the read, and the
### read's quality string. Note that the stored start position of the read is incremented by 1 as a side effect.
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always written out in Sanger encoding (Phred 33 scale)
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;
  my $alignment = $methylation_call_params->{$identifier};

  if ($vanilla){
    # tab-delimited line, one per read
    my @columns = (
		   $identifier,
		   @{$alignment}{qw(alignment_strand chromosome position end_position)},
		   $sequence,
		   @{$alignment}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
		   $quality_value,
		  );
    print OUT join("\t",@columns)."\n";
  }
  else{
    # SAM output whenever $vanilla is not set; the sub is defined at the end of the script
    single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
  }
}
3172
3173 ##########################################
3174 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
3175 ##########################################
3176
### Writes one mapped single-end Bowtie 2 alignment and its methylation call to the SAM output
### (unmapped and ambiguous reads were already printed elsewhere).
### Arguments: read identifier, read sequence, hash reference holding the alignment details, quality string.
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always written out in Sanger encoding (Phred 33 scale)
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  # the SAM writer is defined at the end of the script
  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
3191
3192 ##########################################
3193 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
3194 ##########################################
3195
### Writes one aligned Bowtie 1 read pair and its methylation calls to the output.
### Arguments: read pair identifier, sequence of read 1 and read 2, hash reference holding the alignment details,
### quality string of read 1 and read 2. Note that the stored start_seq_1 is incremented by 1 as a side effect.
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always written out in Sanger encoding (Phred 33 scale)
  if ($phred64){
    ($quality_value_1,$quality_value_2) = map { convert_phred64_quals_to_phred33($_) } ($quality_value_1,$quality_value_2);
  }
  elsif ($solexa){
    ($quality_value_1,$quality_value_2) = map { convert_solexa_quals_to_phred33($_) } ($quality_value_1,$quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp (the end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;
  my $alignment = $methylation_call_params->{$identifier};

  if ($vanilla){
    # tab-delimited line, one per read pair
    my @columns = (
		   $identifier,
		   @{$alignment}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
		   $sequence_1,
		   @{$alignment}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
		   $sequence_2,
		   @{$alignment}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
		   $quality_value_1,
		   $quality_value_2,
		  );
    print OUT join("\t",@columns)."\n";
  }
  else{
    # SAM output whenever $vanilla is not set; the sub is defined at the end of the script
    paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  }

}
3222
3223 ##########################################
3224 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
3225 ##########################################
3226
### Writes one aligned Bowtie 2 read pair and its methylation calls to the SAM output
### (unmapped and ambiguous reads were already printed elsewhere).
### Arguments: read pair identifier, sequence of read 1 and read 2, hash reference holding the alignment details,
### quality string of read 1 and read 2.
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always written out in Sanger encoding (Phred 33 scale)
  if ($phred64){
    ($quality_value_1,$quality_value_2) = map { convert_phred64_quals_to_phred33($_) } ($quality_value_1,$quality_value_2);
  }
  elsif ($solexa){
    ($quality_value_1,$quality_value_2) = map { convert_solexa_quals_to_phred33($_) } ($quality_value_1,$quality_value_2);
  }

  # the SAM writer is defined at the end of the script
  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);

}
3244
3245
### Takes a quality string in Phred+64 encoding and returns the same qualities as a Phred+33 (Sanger) encoded string.
### Every character is first turned into its Phred score and then re-encoded with the offset 33.
sub convert_phred64_quals_to_phred33{
  my ($qual) = @_;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$qual);

  return join ("",@phred33_characters);
}
3261
### Takes a quality string in the Solexa (pre 1.3) encoding and returns it as a Phred+33 (Sanger) encoded string.
### Every character is first turned into a Phred score and then re-encoded with the offset 33.
sub convert_solexa_quals_to_phred33{
  my ($qual) = @_;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$qual);

  return join ("",@phred33_characters);
}
3277
### Returns the character that encodes the given Phred score with an offset of 33.
sub convert_phred_score_into_phred33_quality_string{
  my ($phred_score) = @_;
  return chr($phred_score+33);
}
3283
### Returns the Phred score of a quality character encoded with an offset of 64.
### Only the first character of the argument is looked at.
sub convert_phred64_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-64;
}
3289
### Returns an approximate Phred score for a quality character in the Solexa (pre 1.3) encoding.
### We just use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a
### minute difference for values between 0 and 10. Only the first character of the argument is looked at.
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-59;
}
3296
3297
### Works out the unconverted genomic sequence a single-end Bowtie 1 alignment corresponds to and records it,
### together with the strand and conversion 'memory' needed for the methylation call, in the read's entry of
### $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read within $methylation_call_params
###   $methylation_call_params - hash reference; the entries index, chromosome, position (used directly as substring
###                              offset) and bowtie_sequence of the read are read
###
### Sets the entries alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence and end_position.
### The genomic sequence is 2 bases longer than the read so that a CpG, CHG or CHH context call is also possible for
### a C at the last position of the read; it is left empty if those bases would lie beyond a chromosome edge.
### Dies if index is not 0-3. Relies on the file-level %chromosomes and %counting hashes and on reverse_complement().
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  # making sure the per-read hash exists so that the results can be stored in it
  $methylation_call_params->{$sequence_identifier} = {} unless (defined $methylation_call_params->{$sequence_identifier});
  my $read  = $methylation_call_params->{$sequence_identifier};
  my $index = $read->{index};

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion);

  if ($index == 0){
    ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_CT_count','+','CT','CT');
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_GA_count','-','CT','GA');
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_CT_count','-','GA','CT');
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_GA_count','+','GA','GA');
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  my $chromosome  = $read->{chromosome};
  my $start       = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### alignments to the CT converted genome (index 0 and 2) get their 2 extra bases downstream of the read on the forward strand,
  ### alignments to the GA converted genome (index 1 and 3) get them upstream of the read
  my $extend_downstream = ($genome_conversion eq 'CT');

  ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $within_chromosome = $extend_downstream
    ? (length($chromosomes{$chromosome}) > $start+$read_length+1)
    : ($start-2 >= 0);

  my $non_bisulfite_sequence = '';
  if ($within_chromosome){
    my $offset = $extend_downstream ? $start : $start-2;
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$offset,$read_length+2);

    ## alignments in - orientation need to be reverse complemented
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if ($alignment_strand eq '-');
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $start+$read_length;
}
3401
3402
3403 sub extract_corresponding_genomic_sequence_single_end_bowtie2{
3404 my ($sequence_identifier,$methylation_call_params) = @_;
3405
3406 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{mismatch_info};
3407 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR};
3408
3409 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
3410 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
3411
3412 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
3413 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
3414 my $alignment_strand;
3415 my $read_conversion_info;
3416 my $genome_conversion;
3417 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
3418 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
3419 my $non_bisulfite_sequence = '';
3420
3421 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings
3422 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1;
3423
3424 # parsing CIGAR string
3425 my @len = split (/\D+/,$cigar); # storing the length per operation
3426 my @ops = split (/\d+/,$cigar); # storing the operation
3427 shift @ops; # remove the empty first element
3428 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);
3429
3430 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
3431 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
3432 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3433 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise
3434 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3435 return;
3436 }
3437 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2);
3438 }
3439 my $indels = 0;
3440
3441 foreach (0..$#len){
3442 if ($ops[$_] eq 'M'){
3443 #extracting genomic sequence
3444 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]);
3445 # adjusting position
3446 $pos += $len[$_];
3447 }
3448 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence
3449 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
3450 $non_bisulfite_sequence .= 'N' x $len[$_];
3451 # warn "$non_bisulfite_sequence\n";
3452 # position doesn't need to be adjusting
3453 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
3454 }
3455 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence
3456 # we do not add any genomic sequence but only adjust the position
3457 $pos += $len[$_];
3458 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions)
3459 }
3460 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
3461 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
3462 }
3463 else{
3464 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
3465 }
3466 }
3467
3468 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
3469 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
3470 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
3471 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise
3472 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3473 return;
3474 }
3475 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2);
3476 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n";
3477 }
3478
3479
3480
3481 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
3482 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
3483 ### [Index 0, sequence originated from (converted) forward strand]
3484 $counting{CT_CT_count}++;
3485 $alignment_strand = '+';
3486 $read_conversion_info = 'CT';
3487 $genome_conversion = 'CT';
3488 }
3489
3490 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
3491 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
3492 ### [Index 1, sequence originated from (converted) reverse strand]
3493 $counting{CT_GA_count}++;
3494 $alignment_strand = '-';
3495 $read_conversion_info = 'CT';
3496 $genome_conversion = 'GA';
3497
3498 ### reverse complement!
3499 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3500 }
3501
3502 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
3503 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
3504 ### [Index 2, sequence originated from complementary to (converted) forward strand]
3505 $counting{GA_CT_count}++;
3506 $alignment_strand = '-';
3507 $read_conversion_info = 'GA';
3508 $genome_conversion = 'CT';
3509
3510 ### reverse complement!
3511 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
3512 }
3513
3514 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
3515 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
3516 ### [Index 3, sequence originated from complementary to (converted) reverse strand]
3517 $counting{GA_GA_count}++;
3518 $alignment_strand = '+';
3519 $read_conversion_info = 'GA';
3520 $genome_conversion = 'GA';
3521
3522 }
3523 else{
3524 die "Too many Bowtie 2 result filehandles\n";
3525 }
3526
3527 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand;
3528 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info;
3529 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
3530 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
3531
3532 ### the end position of a read is stored in $pos
3533 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos;
3534 $methylation_call_params->{$sequence_identifier}->{indels} = $indels;
3535 }
3536
3537 ### METHYLATION CALL
3538
### Performs the methylation call for one read by comparing it base by base with the
### corresponding genomic sequence.
###
### Arguments:
###   $identifier                 - read ID (accepted for interface compatibility, not used in here)
###   $sequence_actually_observed - the read sequence as it was sequenced
###   $genomic_sequence           - the genomic sequence to compare against; it is expected to be 2 bp
###                                 longer than the read so that the cytosine context can be determined
###   $read_conversion            - either 'CT' or 'GA'
###
### Returns the methylation call string, one character per base of the read.
### Adds the per-read counts to the file-level %counting hash.
### Dies if $read_conversion is neither 'CT' nor 'GA'.
sub methylation_call{
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
  ### CpG, CHH or CHG context

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################

  ### the lengths are numbers, so they are compared numerically
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2)); ## CHH changed to -2

  ### Both conversion types are handled by the same loop, they only differ in:
  ###   $offset         - where read position 0 sits within @genomic
  ###   $step           - direction in which the sequence context is read (+1 = downstream, -1 = upstream)
  ###   $target_base    - genomic base a call is made for (C, or G for a C on the opposing strand)
  ###   $converted_base - what the target base looks like in the read after bisulfite conversion
  ###   $partner_base   - neighbouring base that defines CpG (first neighbour) or CHG (second neighbour) context
  my ($offset,$step,$target_base,$converted_base,$partner_base);
  if ($read_conversion eq 'CT'){
    ($offset,$step,$target_base,$converted_base,$partner_base) = (0,1,'C','T','G');
  }
  elsif ($read_conversion eq 'GA'){
    ($offset,$step,$target_base,$converted_base,$partner_base) = (2,-1,'G','A','C');
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  my %methylated_symbol   = (CpG => 'Z', CHG => 'X', CHH => 'H');
  my %unmethylated_symbol = (CpG => 'z', CHG => 'x', CHH => 'h');
  my %methylated_count    = (CpG => 0, CHG => 0, CHH => 0);
  my %unmethylated_count  = (CpG => 0, CHG => 0, CHH => 0);

  my $methylation_call = '';

  for my $index (0..$#seq){
    my $position       = $index + $offset;
    my $reference_base = $genomic[$position];

    ### only positions where the genome carries the target base can receive a call
    unless (defined $reference_base and $reference_base eq $target_base){
      $methylation_call .= '.';
      next;
    }

    my $is_methylated;
    if ($seq[$index] eq $target_base){
      $is_methylated = 1;    # base was protected from conversion
    }
    elsif ($seq[$index] eq $converted_base){
      $is_methylated = 0;    # base was converted
    }
    else{
      ### all other mismatches are not of interest for a methylation call
      $methylation_call .= '.';
      next;
    }

    ### determining the sequence context. A neighbour beyond the end of the genomic sequence
    ### is undefined and treated like any non-partner base, i.e. the call ends up as CHH
    my $first_neighbour  = $genomic[$position + $step];
    my $second_neighbour = $genomic[$position + 2*$step];

    my $context = 'CHH';
    if (defined $first_neighbour and $first_neighbour eq $partner_base){
      $context = 'CpG';
    }
    elsif (defined $second_neighbour and $second_neighbour eq $partner_base){
      $context = 'CHG';
    }

    if ($is_methylated){
      ++$methylated_count{$context};
      $methylation_call .= $methylated_symbol{$context};
    }
    else{
      ++$unmethylated_count{$context};
      $methylation_call .= $unmethylated_symbol{$context};
    }
  }

  $counting{total_meCHH_count} += $methylated_count{CHH};
  $counting{total_meCHG_count} += $methylated_count{CHG};
  $counting{total_meCpG_count} += $methylated_count{CpG};
  $counting{total_unmethylated_CHH_count} += $unmethylated_count{CHH};
  $counting{total_unmethylated_CHG_count} += $unmethylated_count{CHG};
  $counting{total_unmethylated_CpG_count} += $unmethylated_count{CpG};

  return $methylation_call;
}
3723
### Reads all FastA files of the genome folder and stores their sequences (upper-cased) in the
### file-level %chromosomes hash, keyed by chromosome name.
###
### Argument: the directory to change back to once the genome has been read.
### Files ending in .fa are used; only if there are none, files ending in .fasta are used instead.
### Multi-FastA files are supported. Dies if the genome folder can't be entered, if it contains no
### sequence files, if a file can't be read or is empty, or if a chromosome name occurs more than once.
sub read_genome_into_memory{
  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### lexical filehandle and 3-argument open so that the filename can never be interpreted as an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    die "The sequence file $chromosome_filename is empty\n" unless (defined $first_line);
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting with an empty string so that entries without any sequence have a defined length of 0
    my $sequence = '';

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    close ($chr_in) or die "Failed to close sequence file $chromosome_filename: $!\n";

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
3795
### Returns the chromosome name of a FastA header line, i.e. everything between the leading '>'
### and the first whitespace. This mirrors what Bowtie appears to use as the sequence name.
### Dies if the line does not start with '>'.
sub extract_chromosome_name {
  my $header = shift;

  ### anything that doesn't start with > is not a FastA header
  unless ($header =~ s/^>//){
    die "The specified chromosome ($header) file doesn't seem to be in FASTA format as required!\n";
  }

  my @header_fields = split (/\s+/,$header);
  return $header_fields[0];
}
3807
### Returns the reverse complement of a DNA sequence. Only the upper case bases A, C, G and T are
### complemented, every other character is kept as it is (but still ends up in reversed order).
sub reverse_complement{
  my $forward = shift;

  ### reversing first and complementing afterwards yields the same string as the other way round
  my $reverse_strand = scalar reverse ($forward);
  $reverse_strand =~ tr/ACGT/TGCA/;

  return $reverse_strand;
}
3814
### Writes bisulfite converted versions of a single-end FastA file into $temp_dir:
### a C -> T converted copy is always written, a G -> A converted copy only unless $directional is set.
###
### Argument: path of the input file; files ending in .gz are read through zcat.
### Honours the file-level settings $skip and $upto. Read IDs are passed through fix_IDs()
### and the sequences are upper-cased before they are converted.
### Returns the names (without $temp_dir) of the C -> T and the G -> A file.
### Dies if a file can't be opened or closed, or if a record doesn't start with '>'.
sub biTransformFastAFiles {
  my $file = shift;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. The list form of the pipe open hands the filename straight to zcat
  ### without going through a shell, so spaces or shell metacharacters in the path are harmless
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fa';
  my $G_to_A_infile = $filename.'_G_to_A.fa';

  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  my $gtoa; # stays undefined for directional libraries
  unless ($directional){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already replaced all tabs at this point, so this can't trigger anymore.
    # It is kept so that the reporting behaviour stays exactly as it was
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The return value of closing the input is deliberately not checked: if we stopped reading early
  ### (--upto) zcat may get terminated by a SIGPIPE, which makes close report a failure
  close ($in);

  ### Closing the output files flushes them before they are used any further, and it is the
  ### only place where buffered write errors (e.g. a full disk) become visible
  close ($ctot) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  if (defined $gtoa){
    close ($gtoa) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
3898
### Writes bisulfite converted versions of one file of a paired-end FastA data set into $temp_dir.
###
### Arguments:
###   $file        - path of the input file; files ending in .gz are read through zcat
###   $read_number - 1 or 2
###
### For directional libraries ($directional) read 1 is only C -> T converted and read 2 is only
### G -> A converted; otherwise both conversions are written for the file. The read IDs get /1 or /2
### appended (/1/1 or /2/2 if $bowtie2 is set). Honours the file-level settings $skip and $upto.
### Returns the name (without $temp_dir) of the file written in directional mode, or the names of the
### C -> T and the G -> A file otherwise.
### Dies if a file can't be opened or closed, if a record doesn't start with '>' or if $read_number
### is neither 1 nor 2.
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. The list form of the pipe open hands the filename straight to zcat
  ### without going through a shell, so spaces or shell metacharacters in the path are harmless
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fa';
  my $G_to_A_infile = $filename.'_G_to_A.fa';

  ### only the handles of the files that are actually written get opened
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was: $read_number\n\n";
    }
  }
  else{ # all four strand output
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### The tag appended to every read ID only depends on the read number and the aligner, so it is
  ### worked out once. It stays undefined for an invalid read number, which is reported when the
  ### first read is about to be written (as before)
  my $read_tag;
  if ($read_number == 1){
    $read_tag = $bowtie2 ? '/1/1' : '/1';
  }
  elsif ($read_number == 2){
    $read_tag = $bowtie2 ? '/2/2' : '/2';
  }

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already replaced all tabs at this point, so this can't trigger anymore.
    # It is kept so that the reporting behaviour stays exactly as it was
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    die "Read number needs to be 1 or 2, but was: $read_number\n\n" unless (defined $read_tag);

    ### $ matches in front of the trailing newline, so the tag ends up at the end of the ID itself
    $header =~ s/$/$read_tag/;

    ### writing to whichever converted file(s) were opened above
    if (defined $ctot){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print {$ctot} "$header$sequence_C_to_T";
    }
    if (defined $gtoa){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The return value of closing the input is deliberately not checked: if we stopped reading early
  ### (--upto) zcat may get terminated by a SIGPIPE, which makes close report a failure
  close ($in);

  ### Closing the output files flushes them before they are used any further, and it is the
  ### only place where buffered write errors (e.g. a full disk) become visible
  if (defined $ctot){
    close ($ctot) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  }
  if (defined $gtoa){
    close ($gtoa) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  unless ($directional){
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }

  if ($read_number == 1){
    print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile);
  }

  print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
  return ($G_to_A_infile);
}
4042
4043
### Writes bisulfite converted versions of a single-end FastQ file into $temp_dir:
### a C -> T converted copy is always written, a G -> A converted copy only unless $directional is set.
###
### Argument: path of the input file; files ending in .gz are read through zcat.
### Honours the file-level settings $skip and $upto. Read IDs are passed through fix_IDs()
### and the sequences are upper-cased before they are converted; the second identifier line and the
### quality line are copied unchanged.
### Returns the name (without $temp_dir) of the C -> T file followed by the name of the G -> A file.
### For directional libraries no G -> A file exists and the second value is just the plain input
### file name without any suffix.
### Dies if a file can't be opened or closed, or if a record doesn't look like FastQ.
sub biTransformFastQFiles {
  my $file = shift;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. The list form of the pipe open hands the filename straight to zcat
  ### without going through a shell, so spaces or shell metacharacters in the path are harmless
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fastq';
  my $G_to_A_infile = $filename; # only gets its suffix if the G -> A file is really written

  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  my $gtoa; # stays undefined for directional libraries
  unless ($directional){
    $G_to_A_infile .= '_G_to_A.fastq';
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # NOTE(review): fix_IDs() has already replaced all tabs at this point, so this can't trigger anymore.
    # It is kept so that the reporting behaviour stays exactly as it was
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The return value of closing the input is deliberately not checked: if we stopped reading early
  ### (--upto) zcat may get terminated by a SIGPIPE, which makes close report a failure
  close ($in);

  ### Closing the output files flushes them before they are used any further, and it is the
  ### only place where buffered write errors (e.g. a full disk) become visible
  close ($ctot) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  if (defined $gtoa){
    close ($gtoa) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
4134
### Writes bisulfite converted versions of one file of a paired-end FastQ data set into $temp_dir.
###
### Arguments:
###   $file        - path of the input file; files ending in .gz are read through zcat
###   $read_number - 1 or 2
###
### For directional libraries ($directional) read 1 is only C -> T converted and read 2 is only
### G -> A converted; otherwise both conversions are written for the file. The read IDs get /1 or /2
### appended (/1/1 or /2/2 if $bowtie2 is set). Honours the file-level settings $skip and $upto.
### Returns the name (without $temp_dir) of the file written in directional mode, or the names of the
### C -> T and the G -> A file otherwise.
### Dies if a file can't be opened or closed, if a record doesn't look like FastQ or if $read_number
### is neither 1 nor 2.
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### the output files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. The list form of the pipe open hands the filename straight to zcat
  ### without going through a shell, so spaces or shell metacharacters in the path are harmless
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = $filename.'_C_to_T.fastq';
  my $G_to_A_infile = $filename.'_G_to_A.fastq';

  ### only the handles of the files that are actually written get opened
  my ($ctot,$gtoa);

  if ($directional){
    if ($read_number == 1){
      print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }
    elsif ($read_number == 2){
      print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### The tag appended to every read ID only depends on the read number and the aligner, so it is
  ### worked out once. It stays undefined for an invalid read number, which is reported when the
  ### first read is about to be written (as before)
  my $read_tag;
  if ($read_number == 1){
    $read_tag = $bowtie2 ? '/1/1' : '/1';
  }
  elsif ($read_number == 2){
    $read_tag = $bowtie2 ? '/2/2' : '/2';
  }

  my $count = 0;

  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
    }

    die "Read number needs to be 1 or 2\n" unless (defined $read_tag);

    ### $ matches in front of the trailing newline, so the tag ends up at the end of the ID itself
    $identifier =~ s/$/$read_tag/;

    ### writing to whichever converted file(s) were opened above
    if (defined $ctot){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if (defined $gtoa){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### The return value of closing the input is deliberately not checked: if we stopped reading early
  ### (--upto) zcat may get terminated by a SIGPIPE, which makes close report a failure
  close ($in);

  ### Closing the output files flushes them before they are used any further, and it is the
  ### only place where buffered write errors (e.g. a full disk) become visible
  if (defined $ctot){
    close ($ctot) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  }
  if (defined $gtoa){
    close ($gtoa) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  unless ($directional){
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }

  if ($read_number == 1){
    print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile);
  }

  print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
  return ($G_to_A_infile);
}
4274
### Returns the read ID with every run of spaces and/or tabs collapsed into one underscore.
### All other characters, including the trailing newline, are left untouched.
sub fix_IDs{
  my ($read_id) = @_;

  ### spaces and tabs are both mapped to '_', the s modifier squashes each run of them into a single '_'
  $read_id =~ tr/ \t/_/s;

  return $read_id;
}
4280
### Checks whether a single-end alignment has the only orientation that makes sense for the
### read/genome conversion combination it was found with.
###
### Arguments:
###   $index  - index into the file-level @fhs array, identifying the alignment process
###   $strand - strand of the alignment, '+' or '-'
###
### Returns 1 for a sensical orientation and increments $fhs[$index]->{seen}.
### Returns 0 for the opposite (nonsensical) orientation and increments $fhs[$index]->{wrong_strand}.
### Any other strand value also returns 0, without touching a counter.
### Dies if the name stored in $fhs[$index] is not one of the four known conversion combinations.
sub ensure_sensical_alignment_orientation_single_end{
  my $index  = shift; # index number if the sequence produced an alignment
  my $strand = shift;

  ##############################################################################################################
  ## The only orientation we want to see for each combination of read and genome conversion:
  ##   CTreadCTgenome  FORWARD converted read against FORWARD converted genome  (+)
  ##   CTreadGAgenome  FORWARD converted read against reverse converted genome  (-)
  ##   GAreadCTgenome  Reverse converted read against FORWARD converted genome  (-)
  ##   GAreadGAgenome  Reverse converted read against reverse converted genome  (+)
  ##############################################################################################################
  my %sensical_strand_for = (
                             CTreadCTgenome => '+',
                             CTreadGAgenome => '-',
                             GAreadCTgenome => '-',
                             GAreadGAgenome => '+',
                            );

  my $name = $fhs[$index]->{name};
  unless (defined $name and exists $sensical_strand_for{$name}){
    die "One of the above conditions must be true\n";
  }

  ### the alignment has the expected orientation: we count it, and return 1 for a correct orientation
  if ($strand eq $sensical_strand_for{$name}){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the alignment has the opposite orientation, i.e. it is nonsensical
  if ($strand eq '+' or $strand eq '-'){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }

  ### Anything that is neither + nor - can't be a sensical orientation either. This used to fall off
  ### the end of the subroutine without an explicit return value
  return 0;
}
4353
sub ensure_sensical_alignment_orientation_paired_ends{
  ### Decides whether a paired-end alignment reported by the alignment process $fhs[$index] is in the
  ### read order that is expected for this process.
  ###
  ### Arguments: index into @fhs, then ID and strand of the first reported line, then ID and strand of the
  ### second reported line. The ID ending (1 or 2) tells us which of the two lines is read 1 and which is read 2.
  ###
  ### Returns 1 (and increments 'seen') for a sensible alignment, 0 (and increments 'wrong_strand') for a
  ### nonsensical one. Dies if the process name is unknown or if the pair is not reported as (+)/(-).
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $name = $fhs[$index]->{name};

  ### CTread1GAread2CTgenome (index 0) and GAread1CTread2GAgenome (index 1): read 1 has to be the (+) line
  ### GAread1CTread2CTgenome (index 2) and CTread1GAread2GAgenome (index 3): read 2 has to be the (+) line
  my $read_1_expected_first;
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome'){
    $read_1_expected_first = 1;
  }
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome'){
    $read_1_expected_first = 0;
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### the only accepted layout is first line (+), second line (-); what differs is which read comes first
  my $plus_then_minus = ($strand_1 eq '+' and $strand_2 eq '-');
  my $read_1_first    = ($plus_then_minus and $id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $read_2_first    = ($plus_then_minus and $id_1 =~ /2$/ and $id_2 =~ /1$/);

  unless ($read_1_first or $read_2_first){
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  ### the observed read order matches the expected one: count it and report a correct orientation
  if ($read_1_expected_first ? $read_1_first : $read_2_first){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### the reads came back in the opposite order, the alignment is nonsensical
  $fhs[$index]->{wrong_strand}++;
  return 0;
}
4450
4451 #####################################################################################################################################################
4452
4453 ### Bowtie 1 (default) | PAIRED-END | FASTA
4454
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  ### Opens one Bowtie paired-end alignment pipe for every entry of @fhs and stores the first two output
  ### lines of each pipe, plus the ID of read 1, in that entry (last_line_1, last_line_2, last_seq_id).
  ### The four converted FastA file names passed in are only used for the status message; the files that
  ### are handed to Bowtie come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### telling the user which files are used and how many Bowtie instances are about to be started
  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 ALIGNER:
  foreach my $aligner (@fhs) {

    ### in directional mode entries without an input file are not aligned at all
    if ($directional and not $aligner->{inputfile_1}){
      @{$aligner}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next ALIGNER;
    }

    ### restricting Bowtie to the one strand that makes sense for this process
    my $bt_options = $bowtie_options;
    my $read_1_forward = ($aligner->{name} eq 'CTread1GAread2CTgenome' or $aligner->{name} eq 'GAread1CTread2GAgenome');
    $bt_options .= $read_1_forward ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $aligner->{name} (reading in sequences from $temp_dir$aligner->{inputfile_1} and $temp_dir$aligner->{inputfile_2}, with the options: $bt_options)\n";
    open ($aligner->{fh},"$path_to_bowtie $bt_options $aligner->{bisulfiteIndex} -1 $temp_dir$aligner->{inputfile_1} -2 $temp_dir$aligner->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment consists of two consecutive output lines
    my $line_1 = $aligner->{fh}->getline();
    my $line_2 = $aligner->{fh}->getline();

    ### no (complete) first alignment: everything is initialised as undefined
    if (not ($line_1 and $line_2)) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$aligner}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next ALIGNER;
    }

    chomp ($line_1,$line_2);
    my ($id_1) = split(/\t/,$line_1); # sequence identifier of the first output line
    my ($id_2) = split(/\t/,$line_2); # sequence identifier of the second output line

    ### Either line can be read 1. The ID carrying the /1 tag is stored (without the tag) as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $aligner->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $aligner->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $aligner->{last_line_1} = $line_1; # may hold read 1 or read 2
    $aligner->{last_line_2} = $line_2; # may hold read 1 or read 2
    warn "Found first alignment:\n$aligner->{last_line_1}\n$aligner->{last_line_2}\n";
  }
}
4533
4534 ### Bowtie 2 | PAIRED-END | FASTA
4535
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Opens one Bowtie 2 paired-end alignment pipe for every entry of @fhs, skips the SAM header and stores
  ### the first two result lines, plus the ID of read 1, in that entry (last_line_1, last_line_2, last_seq_id).
  ### The four converted FastA file names passed in are only used for the status message; the files that
  ### are handed to Bowtie 2 come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  ###
  ### Dies if the pipe cannot be opened or if neither of the first two lines can be identified as read 1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first lines of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode entries without an input file are not aligned at all
    if ($directional){
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so everything up to the first sequence line is skipped (SAM headers start with @).
    ### A lexical variable is used here so that the caller's $_ is not overwritten.
    my $line_1 = $fh->{fh}->getline();
    while ($line_1 and $line_1 =~ /^\@/){
      $line_1 = $fh->{fh}->getline();
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced output we store the first two lines
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;
      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second output line

      ### Either line can be read 1. We identify which sequence was read 1 and store this ID as last_seq_id
      ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        ### This used to be only a warning, which left last_line_1/2 set without a matching last_seq_id.
        ### All other paired-end subroutines treat this situation as fatal, so we do the same here.
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
4623
4624 ### Bowtie 1 (default) | PAIRED-END | FASTQ
4625
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Opens one Bowtie paired-end alignment pipe for every entry of @fhs and stores the first two output
  ### lines of each pipe, plus the ID of read 1, in that entry (last_line_1, last_line_2, last_seq_id).
  ### The four converted FastQ file names passed in are only used for the status message; the files that
  ### are handed to Bowtie come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### telling the user which files are used and how many Bowtie instances are about to be started
  if ($directional){
    print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 ALIGNER:
  foreach my $aligner (@fhs) {

    ### in directional mode entries without an input file are not aligned at all
    if ($directional and not $aligner->{inputfile_1}){
      @{$aligner}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next ALIGNER;
    }

    ### restricting Bowtie to the one strand that makes sense for this process
    my $bt_options = $bowtie_options;
    my $read_1_forward = ($aligner->{name} eq 'CTread1GAread2CTgenome' or $aligner->{name} eq 'GAread1CTread2GAgenome');
    $bt_options .= $read_1_forward ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $aligner->{name} (reading in sequences from $temp_dir$aligner->{inputfile_1} and $temp_dir$aligner->{inputfile_2}, with the options: $bt_options))\n";
    open ($aligner->{fh},"$path_to_bowtie $bt_options $aligner->{bisulfiteIndex} -1 $temp_dir$aligner->{inputfile_1} -2 $temp_dir$aligner->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment consists of two consecutive output lines
    my $line_1 = $aligner->{fh}->getline();
    my $line_2 = $aligner->{fh}->getline();

    ### no (complete) first alignment: everything is initialised as undefined
    if (not ($line_1 and $line_2)) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$aligner}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next ALIGNER;
    }

    chomp ($line_1,$line_2);
    my ($id_1) = split(/\t/,$line_1); # sequence identifier of the first output line
    my ($id_2) = split(/\t/,$line_2); # sequence identifier of the second output line

    ### Either line can be read 1. The ID carrying the /1 tag is stored (without the tag) as last_seq_id
    if ($id_1 =~ s/\/1$//){
      $aligner->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $aligner->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $aligner->{last_line_1} = $line_1; # may hold read 1 or read 2
    $aligner->{last_line_2} = $line_2; # may hold read 1 or read 2
    warn "Found first alignment:\n$aligner->{last_line_1}\n$aligner->{last_line_2}\n";
  }
}
4703
4704 ### Bowtie 2 | PAIRED-END | FASTQ
4705
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Opens one Bowtie 2 paired-end alignment pipe for every entry of @fhs, skips the SAM header and stores
  ### the first two result lines, plus the ID of read 1, in that entry (last_line_1, last_line_2, last_seq_id).
  ### The four converted FastQ file names passed in are only used for the status message; the files that
  ### are handed to Bowtie 2 come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  ###
  ### Dies if the pipe cannot be opened or if neither of the first two lines can be identified as read 1.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first lines of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode entries without an input file are not aligned at all
    if ($directional){
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so everything up to the first sequence line is skipped (SAM headers start with @).
    ### A lexical variable is used here so that the caller's $_ is not overwritten.
    my $line_1 = $fh->{fh}->getline();
    while ($line_1 and $line_1 =~ /^\@/){
      $line_1 = $fh->{fh}->getline();
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced output we store the first two lines
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;

      my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second output line

      ### Either line can be read 1. We identify which sequence was read 1 and store this ID as last_seq_id
      ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
4794
4795 #####################################################################################################################################################
4796
4797 ### Bowtie 1 (default) | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  ### Opens one Bowtie single-end alignment pipe for every entry of @fhs and stores the first output line
  ### of each pipe, plus its sequence ID, in that entry (last_line, last_seq_id).
  ### The two converted FastA file names passed in are only used for the status message; the file that is
  ### handed to Bowtie comes from the inputfile field of each @fhs entry.
  ###
  ### Dies if the pipe to Bowtie cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output
    # (held in a lexical variable so that the caller's $_ is not overwritten)
    my $line = $fh->{fh}->getline();
    if ($line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
4846
4847 ### Bowtie 2 | SINGLE-END | FASTA
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Opens one Bowtie 2 single-end alignment pipe for every entry of @fhs, skips the SAM header and stores
  ### the first result line of each pipe, plus its sequence ID, in that entry (last_line, last_seq_id).
  ### The two converted FastA file names passed in are only used for the status message; the file that is
  ### handed to Bowtie 2 comes from the inputfile field of each @fhs entry.
  ###
  ### Dies if the pipe to Bowtie 2 cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so everything up to the first sequence line is skipped (SAM headers start with @).
    ### A lexical variable is used here so that the caller's $_ is not overwritten.
    my $line = $fh->{fh}->getline();
    while ($line and $line =~ /^\@/){
      $line = $fh->{fh}->getline();
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if ($line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise (no output line at all) we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
4906
4907
4908 ### Bowtie 1 (default) | SINGLE-END | FASTQ
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Opens one Bowtie single-end alignment pipe for every entry of @fhs and stores the first output line
  ### of each pipe, plus its sequence ID, in that entry (last_line, last_seq_id).
  ### The two converted FastQ file names passed in are only used for the status message; the file that is
  ### handed to Bowtie comes from the inputfile field of each @fhs entry.
  ###
  ### Dies if the pipe to Bowtie cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    ### The command has to use $bt_options (not the plain $bowtie_options), otherwise the --norc/--nofw
    ### restriction announced in the message above never reaches Bowtie
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output
    # (held in a lexical variable so that the caller's $_ is not overwritten)
    my $line = $fh->{fh}->getline();
    if ($line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
4956
4957 ### Bowtie 2 | SINGLE-END | FASTQ
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Opens one Bowtie 2 single-end alignment pipe for every entry of @fhs, skips the SAM header and stores
  ### the first result line of each pipe, plus its sequence ID, in that entry (last_line, last_seq_id).
  ### The two converted FastQ file names passed in are only used for the status message; the file that is
  ### handed to Bowtie 2 comes from the inputfile field of each @fhs entry.
  ###
  ### Dies if the pipe to Bowtie 2 cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the first line of the output
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }
    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so everything up to the first sequence line is skipped (SAM headers start with @).
    ### A lexical variable is used here so that the caller's $_ is not overwritten.
    my $line = $fh->{fh}->getline();
    while ($line and $line =~ /^\@/){
      $line = $fh->{fh}->getline();
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if ($line) {
      chomp $line;
      my $id = (split(/\t/,$line))[0]; # this is the first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise (no output line at all) we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
5015
5016 ###########################################################################################################################################
5017
sub reset_counters_and_fhs{
  ### Sets every counter in %counting back to 0 and rebuilds @fhs from scratch.
  ### $filename is only inspected for a comma, which marks a paired-end file pair.
  my $filename = shift;

  ### methylation call counters, sequence/alignment bookkeeping, and one counter per alignment strand:
  ###   CT_CT, CT_GA, GA_CT, GA_GA             => single-end   (read conversion/genome conversion)
  ###   CT_GA_CT, GA_CT_GA, GA_CT_CT, CT_GA_GA => paired-end   (read 1/read 2/genome conversion)
  ###   alignments_rejected_count is only relevant if --directional was specified
  %counting = map { $_ => 0 } qw(
    total_meCHH_count
    total_meCHG_count
    total_meCpG_count
    total_unmethylated_CHH_count
    total_unmethylated_CHG_count
    total_unmethylated_CpG_count
    sequences_count
    no_single_alignment_found
    unsuitable_sequence_count
    genomic_sequence_could_not_be_extracted_count
    unique_best_alignment_count
    low_complexity_alignments_overruled_count
    CT_CT_count
    CT_GA_count
    GA_CT_count
    GA_GA_count
    CT_GA_CT_count
    GA_CT_GA_count
    GA_CT_CT_count
    CT_GA_GA_count
    alignments_rejected_count
  );

  ### the alignment processes as: name, strand identity, bisulfite index to align against
  my @processes = (
    [ 'CTreadCTgenome', 'con ori forward',       $CT_index_basename ],
    [ 'CTreadGAgenome', 'con ori reverse',       $GA_index_basename ],
    [ 'GAreadCTgenome', 'compl ori con forward', $CT_index_basename ],
    [ 'GAreadGAgenome', 'compl ori con reverse', $GA_index_basename ],
  );

  ### directional single-end files only get the first two processes, every other case gets all four
  if ($directional and $filename !~ ','){
    splice (@processes, 2);
  }

  @fhs = map {
    +{
      name            => $_->[0],
      strand_identity => $_->[1],
      bisulfiteIndex  => $_->[2],
      seen            => 0,
      wrong_strand    => 0,
    }
  } @processes;
}
5119
5120
### Parses and validates the Bismark command line.
###
### Options are read from @ARGV with Getopt::Long; the first remaining argument is taken
### as the genome folder and, for single-end mode, all further arguments as input files.
###
### Returns the following list (in this order):
###   genome folder (absolute, trailing slash), C->T index basename, G->A index basename,
###   Bowtie executable (path or plain name), sequence format ('FASTA' or 'FASTQ'),
###   Bowtie options joined into one string, directional flag, unmapped flag, ambiguous flag,
###   phred64 flag, solexa flag, output directory, bowtie2 flag, vanilla flag, sam-no-hd flag,
###   skip, upto, temporary directory.
###
### Side effects:
###   - populates the file-level array @filenames (paired-end files are stored as "mate1,mate2")
###   - prints progress messages and sleeps briefly after some of them
###   - exits after printing the help text or the version, dies on any invalid option combination
###   - creates the output and temporary directories if they do not exist yet
###   - changes the working directory several times; on return it is the temporary directory
###     if --temp_dir was given, otherwise the directory stored in $parent_dir
sub process_command_line{
  my @bowtie_options;
  my $help;
  my $mates1;
  my $mates2;
  my $path_to_bowtie;
  my $fastq;
  my $fasta;
  my $skip;
  my $qupto;
  my $phred64;
  my $phred33;
  my $solexa;
  my $mismatches;
  my $seed_length;
  my $best;
  my $sequence_format;
  my $version;
  my $quiet;
  my $chunk;
  my $non_directional;
  my $ceiling;
  my $maxins;
  my $minins;
  my $unmapped;
  my $multi_map;
  my $output_dir;
  my $bowtie2;
  my $vanilla;
  my $sam_no_hd;
  my $seed_extension_fails;
  my $reseed_repetitive_seeds;
  my $most_valid_alignments;
  my $score_min;
  my $parallel;
  my $temp_dir;
  my $rdg;
  my $rfg;

  # GetOptions returns false if any option could not be parsed. The result is kept in its own
  # variable so that it does not shadow the file-level $command_line string.
  my $options_ok = GetOptions ('help|man'                => \$help,
                               '1=s'                     => \$mates1,
                               '2=s'                     => \$mates2,
                               'path_to_bowtie=s'        => \$path_to_bowtie,
                               'f|fasta'                 => \$fasta,
                               'q|fastq'                 => \$fastq,
                               's|skip=i'                => \$skip,
                               'u|upto=i'                => \$qupto,
                               'phred33-quals'           => \$phred33,
                               'phred64-quals|solexa1'   => \$phred64,
                               'solexa-quals'            => \$solexa,
                               'n|seedmms=i'             => \$mismatches,
                               'l|seedlen=i'             => \$seed_length,
                               'no_best'                 => \$best,
                               'version'                 => \$version,
                               'quiet'                   => \$quiet,
                               'chunkmbs=i'              => \$chunk,
                               'non_directional'         => \$non_directional,
                               'I|minins=i'              => \$minins,
                               'X|maxins=i'              => \$maxins,
                               'e|maqerr=i'              => \$ceiling,
                               'un|unmapped'             => \$unmapped,
                               'ambiguous'               => \$multi_map,
                               'o|output_dir=s'          => \$output_dir,
                               'bowtie2'                 => \$bowtie2,
                               'vanilla'                 => \$vanilla,
                               'sam-no-hd'               => \$sam_no_hd,
                               'D=i'                     => \$seed_extension_fails,
                               'R=i'                     => \$reseed_repetitive_seeds,
                               'score_min=s'             => \$score_min,
                               'most_valid_alignments=i' => \$most_valid_alignments,
                               'p=i'                     => \$parallel,
                               'temp_dir=s'              => \$temp_dir,
                               'rdg=s'                   => \$rdg,
                               'rfg=s'                   => \$rfg,
                              );


  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($options_ok){
    die "Please respecify command line options\n";
  }
  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }
  if ($version){
    print <<"VERSION";


Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/


VERSION
    exit;
  }


  ##########################
  ### PROCESSING OPTIONS ###
  ##########################

  # normalising unset switches to 0 so that callers always receive a defined value
  unless ($bowtie2){
    $bowtie2 = 0;
  }
  unless ($sam_no_hd){
    $sam_no_hd = 0;
  }

  ### PATH TO BOWTIE
  ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
  if ($path_to_bowtie){
    unless ($path_to_bowtie =~ /\/$/){
      $path_to_bowtie =~ s/$/\//;
    }
    if (-d $path_to_bowtie){
      if ($bowtie2){
        $path_to_bowtie = "${path_to_bowtie}bowtie2";
      }
      else{
        $path_to_bowtie = "${path_to_bowtie}bowtie";
      }
    }
    else{
      die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
    }
  }
  else{
    if ($bowtie2){
      $path_to_bowtie = 'bowtie2';
      warn "Path to Bowtie 2 specified as: $path_to_bowtie\n";
    }
    else{
      $path_to_bowtie = 'bowtie';
      warn "Path to Bowtie specified as: $path_to_bowtie\n";
    }
  }

  ####################################
  ### PROCESSING ARGUMENTS

  ### GENOME FOLDER
  my $genome_folder = shift @ARGV; # mandatory
  unless ($genome_folder){
    warn "Genome folder was not specified!\n";
    print_helpfile();
    exit;
  }

  ### checking that the genome folder, all subfolders and the required bowtie index files exist
  unless ($genome_folder =~ /\/$/){
    $genome_folder =~ s/$/\//;
  }

  if (chdir $genome_folder){
    my $absolute_genome_folder = getcwd(); ## making the genome folder path absolute
    unless ($absolute_genome_folder =~ /\/$/){
      $absolute_genome_folder =~ s/$/\//;
    }
    warn "Reference genome folder provided is $genome_folder\t(absolute path is '${absolute_genome_folder}')\n";
    $genome_folder = $absolute_genome_folder;
  }
  else{
    die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
  }

  my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
  my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";

  ### checking the integrity of $CT_dir first and of $GA_dir second
  if ($bowtie2){ ### Bowtie 2 (new)
    _check_bowtie_index_files($CT_dir,'BS_CT','bt2','Bowtie 2','C->T');
    _check_bowtie_index_files($GA_dir,'BS_GA','bt2','Bowtie 2','G->A');
  }
  else{ ### Bowtie 1 (default)
    _check_bowtie_index_files($CT_dir,'BS_CT','ebwt','Bowtie','C->T');
    _check_bowtie_index_files($GA_dir,'BS_GA','ebwt','Bowtie','G->A');
  }

  my $CT_index_basename = "${CT_dir}BS_CT";
  my $GA_index_basename = "${GA_dir}BS_GA";

  ### INPUT OPTIONS

  ### SEQUENCE FILE FORMAT
  ### exits if both fastA and FastQ were specified
  if ($fasta and $fastq){
    die "Only one sequence filetype can be specified (fastA or fastQ)\n";
  }

  ### unless fastA is specified explicitly, fastQ sequence format is expected by default
  if ($fasta){
    print "FastA format specified\n";
    $sequence_format = 'FASTA';
    push @bowtie_options, '-f';
  }
  elsif ($fastq){
    print "FastQ format specified\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }
  else{
    $fastq = 1;
    print "FastQ format assumed (by default)\n";
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }

  ### SKIP
  ### the value is not forwarded to Bowtie here; it is handed back to the caller in the return list
  if ($skip){
    warn "Skipping the first $skip reads from the input file\n";
  }

  ### UPTO
  ### as for --skip, the value is only returned to the caller and not added to the Bowtie options
  if ($qupto){
    warn "Processing sequences up to read no. $qupto from the input file\n";
  }

  ### QUALITY VALUES
  if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
    die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)\n";
  }
  if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values works only when -q (FASTQ) is specified\n";
    }
    if ($bowtie2){
      push @bowtie_options,"--phred33";
    }
    else{
      push @bowtie_options,"--phred33-quals";
    }
  }
  if ($phred64){
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values work only when -q (FASTQ) is specified\n";
    }
    if ($bowtie2){
      push @bowtie_options,"--phred64";
    }
    else{
      push @bowtie_options,"--phred64-quals";
    }
  }
  else{
    $phred64 = 0;
  }

  if ($solexa){
    if ($bowtie2){
      die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
    }
    # Solexa to Phred value conversion works only when -q is specified
    unless ($fastq){
      die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options,"--solexa-quals";
  }
  else{
    $solexa = 0;
  }

  ### ALIGNMENT OPTIONS

  ### MISMATCHES
  if (defined $mismatches){
    if ($bowtie2){
      if ($mismatches == 0 or $mismatches == 1){
        push @bowtie_options,"-N $mismatches";
      }
      else{
        die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
      }
    }
    else{
      if ($mismatches >= 0 and $mismatches <= 3){
        push @bowtie_options,"-n $mismatches";
      }
      else{
        die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
      }
    }
  }
  else{
    unless ($bowtie2){
      push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
    }
  }

  ### SEED LENGTH
  if (defined $seed_length){
    if ($bowtie2){
      push @bowtie_options,"-L $seed_length";
    }
    else{
      push @bowtie_options,"-l $seed_length";
    }
  }

  ### MISMATCH CEILING
  if (defined $ceiling){
    die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
    push @bowtie_options,"-e $ceiling";
  }


  ### BOWTIE 2 EFFORT OPTIONS

  ### CONSECUTIVE SEED EXTENSION FAILS
  if (defined $seed_extension_fails){
    die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-D $seed_extension_fails";
  }

  ### RE-SEEDING REPETITIVE SEEDS
  if (defined $reseed_repetitive_seeds){
    die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-R $reseed_repetitive_seeds";
  }


  ### BOWTIE 2 SCORING OPTIONS
  if ($score_min){
    die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($score_min =~ /^L,.+,.+$/){
      die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--score-min $score_min";
  }
  else{
    if ($bowtie2){
      push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
    }
  }

  ### BOWTIE 2 READ GAP OPTIONS
  if ($rdg){
    die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($rdg =~ /^.+,.+$/){
      die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--rdg $rdg";
  }

  ### BOWTIE 2 REFERENCE GAP OPTIONS
  if ($rfg){
    die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($rfg =~ /^.+,.+$/){
      die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--rfg $rfg";
  }



  ### BOWTIE 2 PARALLELIZATION OPTIONS
  ### testing with 'defined' so that '-p 0' is rejected in the same way as '-p 1' instead of being silently ignored
  if (defined $parallel){
    die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!\n" unless ($bowtie2);
    die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
    push @bowtie_options,"-p $parallel";
    push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is absolutely required for parallelization to work.
    print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
    sleep (2);
  }

  ### REPORTING OPTIONS

  if ($bowtie2){
    push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default

    ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7, so it is only warned about and never passed on to Bowtie 2
    if (defined $most_valid_alignments){
      warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
    }
  }
  else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
    push @bowtie_options,'-k 2';
  }

  ### --BEST
  if ($bowtie2){
    if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
      die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
    }
  }
  else{
    # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
    unless ($best){
      push @bowtie_options,'--best';
    }
  }

  ### VANILLA BISMARK (BOWTIE 1) OUTPUT
  if ($vanilla){
    if ($bowtie2){
      die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
    }
  }
  else{
    $vanilla = 0;
  }

  ### PAIRED-END MAPPING
  if ($mates1){
    my @mates1 = (split (/,/,$mates1));
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
    my @mates2 = (split(/,/,$mates2));
    unless (scalar @mates1 == scalar @mates2){
      die "Paired-end mapping requires the same amount of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
    }
    # each pair of files is stored as one comma-separated entry in the file-level @filenames
    while (1){
      my $mate1 = shift @mates1;
      my $mate2 = shift @mates2;
      last unless ($mate1 and $mate2);
      push @filenames,"$mate1,$mate2";
    }
    if ($bowtie2){
      push @bowtie_options,'--no-mixed';     ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
      push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
    }
  }
  elsif ($mates2){
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
  }

  ### SINGLE-END MAPPING
  # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
  my $singles;
  unless ($mates1 and $mates2){
    $singles = join (',',@ARGV);
    unless ($singles){
      die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
    }
    $singles =~ s/\s/,/g;
    @filenames = (split(/,/,$singles));
    warn "\nFiles to be analysed:\n";
    warn "@filenames\n\n";
    sleep (3);
  }

  ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $minins){
    die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--minins $minins";
  }

  ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $maxins){
    die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--maxins $maxins";
  }
  else{
    unless ($singles){
      push @bowtie_options,'--maxins 500';
    }
  }

  ### QUIET prints nothing besides alignments (suppresses warnings)
  if ($quiet){
    push @bowtie_options,'--quiet';
  }

  ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
  unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
    if (defined $chunk){
      push @bowtie_options,"--chunkmbs $chunk";
    }
    else{
      push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
    }
  }


  ### SUMMARY OF ALL BOWTIE OPTIONS
  my $bowtie_options = join (' ',@bowtie_options);


  ### STRAND-SPECIFIC LIBRARIES
  my $directional;
  if ($non_directional){
    print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
    sleep (3);
    $directional = 0;
  }
  else{
    print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
    sleep (3);
    $directional = 1; # Changed this to being the default behaviour
  }

  ### UNMAPPED SEQUENCE OUTPUT
  $unmapped = 0 unless ($unmapped);

  ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
  $multi_map = 0 unless ($multi_map);


  ### OUTPUT DIRECTORY

  # relative paths given on the command line are resolved against the directory Bismark was started from
  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($output_dir){
    $output_dir = _enter_or_create_directory($output_dir,'output');
    warn "Output will be written into the directory: $output_dir\n";
  }
  else{
    $output_dir = '';
  }

  ### TEMPORARY DIRECTORY for C->T and G->A transcribed files

  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($temp_dir){
    warn "\nUsing temp directory: $temp_dir\n";
    $temp_dir = _enter_or_create_directory($temp_dir,'temporary');
    warn "Temporary files will be written into the directory: $temp_dir\n";
  }
  else{
    $temp_dir = '';
  }


  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
}

### Helper for process_command_line: moves into $index_dir and dies unless all six index files
### "<prefix>.<part>.<suffix>" (parts 1, 2, 3, 4, rev.1 and rev.2) are present as plain files.
### $aligner_name ('Bowtie' or 'Bowtie 2') and $conversion ('C->T' or 'G->A') are only used
### in the error message. The working directory is left at $index_dir.
sub _check_bowtie_index_files{
  my ($index_dir,$prefix,$suffix,$aligner_name,$conversion) = @_;
  chdir $index_dir or die "Failed to move to directory $index_dir: $!\n";
  foreach my $part (qw(1 2 3 4 rev.1 rev.2)){
    my $file = "$prefix.$part.$suffix";
    unless (-f $file){
      die "The $aligner_name index of the $conversion converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
    }
  }
}

### Helper for process_command_line: moves into $dir, creating it first if it cannot be entered,
### and returns its absolute path with a trailing slash. $label ('output' or 'temporary') is only
### used in the message printed when the directory had to be created. The working directory is
### left at $dir.
sub _enter_or_create_directory{
  my ($dir,$label) = @_;
  unless ($dir =~ /\/$/){
    $dir =~ s/$/\//;
  }
  unless (chdir $dir){
    mkdir $dir or die "Unable to create directory $dir $!\n";
    warn "Created $label directory $dir!\n\n";
    chdir $dir or die "Failed to move to $dir\n";
  }
  my $absolute_dir = getcwd(); # making the path absolute
  unless ($absolute_dir =~ /\/$/){
    $absolute_dir =~ s/$/\//;
  }
  return $absolute_dir;
}
5728
5729
5730
### Writes the SAM header to the OUT filehandle:
###   - one @HD line (format version 1.0, sort order 'unsorted')
###   - one @SQ line per entry of the file-level %chromosomes hash, giving the sequence name
###     and the length of the stored sequence
###   - one @PG line with the Bismark version and the command line Bismark was invoked with
### The @SQ lines are written in sorted name order; iterating over 'keys' directly yields an
### arbitrary order that may differ between runs, which made the header non-reproducible.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }
  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
5739
5740 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
5741 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
5742
### Prints one SAM record for a single-end alignment to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier, also the key into $methylation_call_params
###   $actual_seq              - the read sequence
###   $methylation_call_params - hash reference holding the alignment details of the read
###   $qual                    - quality string of the read
###
### Dies if the alignment strand is neither '+' nor '-', or if the strand does not fit the
### combination of read and genome conversion.
###
### The FLAG field only uses bit 0x10 (16, SEQ being reverse complemented) here; all paired-end
### bits stay 0. Because FLAG cannot express four bisulfite strands, OT and CTOT are both
### written as FLAG 0 and OB and CTOB as FLAG 16; the XR and XG tags carry the exact strand.
sub single_end_SAM_output{
  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  my $strand            = $methylation_call_params->{$id}->{alignment_strand};
  my $chr               = $methylation_call_params->{$id}->{chromosome};
  my $start             = $methylation_call_params->{$id}->{position};
  my $ref_seq           = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
  my $methcall          = $methylation_call_params->{$id}->{methylation_call};
  my $read_conversion   = $methylation_call_params->{$id}->{read_conversion};
  my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};

  ### FLAG: valid combinations of alignment strand and "read conversion/genome conversion"
  my %flag_for = (
    '+' => {
      'CT/CT' => 0,    # OT
      'GA/GA' => 16,   # CTOB, yields information for the original bottom strand
    },
    '-' => {
      'CT/GA' => 16,   # OB
      'GA/CT' => 0,    # CTOT, yields information for the original top strand
    },
  );

  unless (exists $flag_for{$strand}){
    die "Unexpected strand information: $strand\n\n";
  }
  my $flag = $flag_for{$strand}->{"$read_conversion/$genome_conversion"};
  unless (defined $flag){
    die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }

  ### CIGAR: Bowtie 2 reports an actual CIGAR string; Bowtie 1 output does not contain indels,
  ### so the whole read is written as matches/mismatches
  my $cigar = $bowtie2 ? $methylation_call_params->{$id}->{CIGAR} : length($actual_seq) . "M";

  ### The genomic sequence carries 2 additional nucleotides which are removed again: from the
  ### 3' end for CT converted reads (original top or bottom strand), otherwise from the 5' end
  ### (complementary strands in non-directional libraries)
  my $trimmed_length = length($ref_seq) - 2;
  my $trim_offset    = ($read_conversion eq 'CT') ? 0 : 2;
  $ref_seq = substr($ref_seq, $trim_offset, $trimmed_length);

  ### SAM stores the sequence as it appears on the forward genomic strand, so reads on the
  ### '-' strand are reverse-complemented and their quality string is reversed along with them
  my $on_reverse_strand = ($strand eq '-');
  if ($on_reverse_strand){
    $actual_seq = revcomp($actual_seq);
    $ref_seq    = revcomp($ref_seq);    # required for the comparison with the read sequence
    $qual       = reverse $qual;
  }

  ### NM tag: edit distance to the reference; for Bowtie 2 the inserted/deleted bases are added
  my $edit_distance = hemming_dist($actual_seq,$ref_seq);
  $edit_distance += $methylation_call_params->{$id}->{indels} if ($bowtie2);

  ### XX tag: mismatched reference bases in the alignment (NO indel information!)
  my $mismatch_tag = make_mismatch_string($actual_seq, $ref_seq);

  ### XM tag: methylation call string, reversed together with the read on the '-' strand
  my $oriented_methcall = $on_reverse_strand ? scalar reverse($methcall) : $methcall;

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @sam_fields = (
    $id,
    $flag,
    $chr,
    $start,
    255,                        # MAPQ: mapping quality is assumed to be unavailable
    $cigar,
    "*",                        # RNEXT: paired-end only
    0,                          # PNEXT: paired-end only
    0,                          # TLEN:  paired-end only
    $actual_seq,
    $qual,
    "NM:i:$edit_distance",
    $mismatch_tag,
    "XM:Z:$oriented_methcall",
    "XR:Z:$read_conversion",    # XR: read conversion
    "XG:Z:$genome_conversion",  # XG: genome conversion
  );
  print OUT join("\t",@sam_fields),"\n";
}
5876
5877
5878 sub paired_end_SAM_output{
5879 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
5880 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
5881 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
5882 my $chr = $methylation_call_params->{$id}->{chromosome};
5883 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
5884 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
5885 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
5886 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
5887 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
5888 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
5889 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
5890 my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
5891 my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
5892
5893 my $id_1 = $id.'/1';
5894 my $id_2 = $id.'/2';
5895
5896 # Allows all degenerate nucleotide sequences in reference genome
5897 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
5898 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
5899
5900 my $index; # used to store the srand origin of the alignment in a less convoluted way
5901
5902 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
5903 $index = 0; ## this is OT (original top strand)
5904 }
5905 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
5906 $index = 1; ## this is CTOB (complementary to OB)
5907 }
5908 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
5909 $index = 2; ## this is CTOT (complementary to OT)
5910 }
5911 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
5912 $index = 3; ## this is OB (original bottom)
5913 }
5914 else {
5915 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
5916 }
5917
5918 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
5919 ### first or last position.
5920
5921 if ($index == 0 or $index == 3){ # OT or OB
5922 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
5923 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
5924 }
5925 else{ # CTOT or CTOB
5926 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
5927 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
5928 }
5929
5930 #####
5931
5932 my $start_read_1;
5933 my $start_read_2;
5934 # adjusting end positions
5935
5936 if ($bowtie2){
5937 $start_read_1 = $methylation_call_params->{$id}->{position_1};
5938 $start_read_2 = $methylation_call_params->{$id}->{position_2};
5939 }
5940 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
5941 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
5942 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
5943 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
5944 }
5945 else{ # read 1 is on the - strand
5946 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
5947 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
5948 }
5949 }
5950
5951 #####
5952
5953 my $end_read_1;
5954 my $end_read_2;
5955 # adjusting end positions
5956
5957 if ($bowtie2){
5958 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
5959 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
5960 }
5961 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
5962 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
5963 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
5964 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
5965 }
5966 else{
5967 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
5968 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
5969 }
5970 }
5971
5972 #####
5973
5974 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
5975 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
5976 ## Bit Description Comment Value
5977 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
5978 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
5979 ## 0x4 segment unmapped --- ---
5980 ## 0x8 next segment in the template unmapped --- ---
5981 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
5982 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
5983 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
5984 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
5985 ## 0x100 secondary alignment --- ---
5986 ## 0x200 not passing quality controls --- ---
5987 ## 0x400 PCR or optical duplicate --- ---
5988
5989 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
5990
5991 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
5992 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
5993
5994 my $flag_1; # FLAG variable used for SAM format
5995 my $flag_2;
5996
5997 if ($index == 0){ # OT
5998 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
5999 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
6000 }
6001 elsif ($index == 1){ # CTOB
6002 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
6003 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
6004 }
6005 elsif ($index == 2){ # CTOT
6006 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
6007 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
6008 }
6009 elsif ($index == 3){ # OB
6010 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
6011 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
6012 }
6013
6014 #####
6015
6016 my $mapq = 255; # Mapping quality is unavailable
6017
6018 #####
6019
6020 my $cigar_1;
6021 my $cigar_2;
6022
6023 if ($bowtie2){
6024 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
6025 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
6026 }
6027 else{
6028 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
6029 $cigar_2 = length($actual_seq_2) . "M";
6030 }
6031
6032 #####
6033
6034 my $rnext = '='; # Chromosome of mate; applies to both reads
6035
6036 #####
6037
6038 my $pnext_1 = $start_read_2; # Leftmost position of mate
6039 my $pnext_2 = $start_read_1;
6040
6041 #####
6042
6043 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
6044 my $tlen_2;
6045
6046 if ($bowtie2){
6047
6048 if ($start_read_1 <= $start_read_2){
6049
6050 # Read 1 alignment is leftmost
6051
6052 if ($end_read_2 >= $end_read_1){
6053
6054 # -------------------------> read 1 reads overlapping
6055 # <------------------------- read 2
6056 #
6057 # or
6058 #
6059 # -------------------------> read 1
6060 # <----------------------- read 2 read 2 contained within read 1
6061 #
6062 # or
6063 #
6064 # -------------------------> read 1 reads 1 and 2 exactly overlapping
6065 # <------------------------- read 2
6066 #
6067
6068 # dovetailing of reads is not enabled for Bowtie 2 alignments
6069
6070 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6071 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6072 }
6073 elsif ($end_read_2 < $end_read_1){
6074
6075 # -------------------------> read 1
6076 # <----------- read 2 read 2 contained within read 1
6077 #
6078 # or
6079 #
6080 # -------------------------> read 1
6081 # <----------- read 2 read 2 contained within read 1
6082
6083 # start and end of read 2 are fully contained within read 1
6084 $tlen_1 = 0; # Set as 0 when the information is unavailable
6085 $tlen_2 = 0; # Set as 0 when the information is unavailable
6086 }
6087
6088 }
6089
6090 elsif ($start_read_2 < $start_read_1){
6091
6092 if ($end_read_1 >= $end_read_2){
6093
6094 # Read 2 alignment is leftmost
6095
6096 # -------------------------> read 2 reads overlapping
6097 # <------------------------- read 1
6098 #
6099 # or
6100 #
6101 # -------------------------> read 2
6102 # <----------------------- read 1 read 1 contained within read 2
6103 #
6104 #
6105
6106 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6107 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6108 }
6109 elsif ($end_read_1 < $end_read_2){
6110
6111 # -------------------------> read 2
6112 # <----------- read 1 read 1 contained within read 2
6113 #
6114 # or
6115 #
6116 # -------------------------> read 2
6117 # <----------- read 1 read 1 contained within read 2
6118
6119 # start and end of read 1 are fully contained within read 2
6120 $tlen_1 = 0; # Set as 0 when the information is unavailable
6121 $tlen_2 = 0; # Set as 0 when the information is unavailable
6122 }
6123 }
6124 }
6125
6126 else{ # Bowtie 1
6127
6128 if ($end_read_2 >= $end_read_1){
6129 # Read 1 alignment is leftmost
6130 # -------------------------> read 1
6131 # <------------------------- read 2
6132 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6133
6134 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
6135 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
6136 }
6137 else{
6138 # Read 2 alignment is leftmost
6139 # -------------------------> read 2
6140 # <------------------------- read 1
6141 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
6142
6143 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
6144 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
6145 }
6146 }
6147
6148 #####
6149
6150 # adjusting the strand of the sequence before we use them to generate mismatch strings
6151 if ($strand_1 eq '-'){
6152 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
6153 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
6154 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
6155 }
6156 if ($strand_2 eq '-'){
6157 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
6158 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
6159 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
6160 }
6161
6162 # print "$actual_seq_1\n$ref_seq_1\n\n";
6163 # print "$actual_seq_2\n$ref_seq_2\n\n";
6164
6165 #####
6166
6167 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
6168 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
6169 if ($bowtie2){
6170 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6171 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
6172 }
6173 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
6174 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
6175
6176 #####
6177
6178 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
6179 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
6180
6181 #####
6182
6183 my $XM_tag_1; # Optional tag XM: Methylation call string
6184 my $XM_tag_2;
6185
6186 if ($strand_1 eq '-'){
6187 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
6188 }
6189 else{
6190 $XM_tag_1 = "XM:Z:$methcall_1";
6191 }
6192
6193 if ($strand_2 eq '-'){
6194 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
6195 }
6196 else{
6197 $XM_tag_2 = "XM:Z:$methcall_2";
6198 }
6199
6200 #####
6201
6202 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
6203 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
6204
6205 #####
6206
6207 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
6208
6209 #####
6210
6211 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
6212 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
6213 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
6214 }
6215
# Returns the reverse complement of a nucleotide sequence.
#   Argument: the sequence string; dies if it is missing or evaluates to false.
#   Returns:  the reversed string with A<->T and C<->G swapped. Lower-case
#             a/c/t/g are complemented to upper-case letters; any other
#             character (e.g. N) is passed through unchanged.
sub revcomp{
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  # scalar assignment forces string reversal (not list reversal)
  my $reverse_complement = reverse $sequence;
  $reverse_complement =~ tr/ACTGactg/TGACTGAC/;

  return $reverse_complement;
}
6222
# Counts the positions at which the read differs from the reference
# (Hamming distance), compared case-sensitively over the length of the read.
#   Arguments: (1) actual read sequence, (2) reference sequence.
#   Returns:   number of read positions that do not match the reference.
#              Read positions beyond the end of a shorter reference count as
#              mismatches; reference bases beyond the end of the read are
#              ignored. Undefined arguments are treated as empty strings.
sub hemming_dist{
  my ($actual_seq, $ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  # Only the part of the reference covered by the read is compared, so a longer
  # reference does not inflate the distance.
  my $ref_window = substr($ref_seq, 0, length($actual_seq));

  # String-wise XOR yields a NUL byte wherever both strings agree; a shorter
  # operand is padded with NULs, so uncovered read positions come out non-NUL.
  # Both operands are stringified explicitly to guarantee string (not numeric) XOR.
  my $diff = "$actual_seq" ^ "$ref_window";

  # tr with /c and an empty replacement list counts the non-NUL bytes without
  # modifying the string
  my $mismatches = ($diff =~ tr/\0//c);
  return $mismatches;
}
6232
# Builds the optional SAM tag XX, which lists the reference bases at mismatched
# positions separated by the number of matching bases in between (no indel
# information).
#   Arguments: (1) actual read sequence, (2) reference sequence; dies if either
#              is missing or evaluates to false.
#   Returns:   a string starting with "XX:Z:". Runs of matches are written as
#              their length (omitted when zero), each mismatch as the reference
#              base at that position, e.g. read ACGT vs. reference AGGT gives
#              "XX:Z:1G2".
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq    = shift or die "Missing reference sequence";

  my $XX_tag      = "XX:Z:";
  my $ref_length  = length($ref_seq);
  my $diff        = ($actual_seq ^ $ref_seq);   # string-wise XOR: NUL byte wherever the two sequences agree
  my $prev_mm_pos = 0;                          # 1-based position of the previous mismatch (0 = none yet)

  while ($diff =~ /[^\0]/g){                    # visit every position where the sequences differ
    my $mm_pos    = pos($diff);                 # 1-based position of the current mismatch
    my $nuc_match = $mm_pos - $prev_mm_pos - 1; # number of matching bases since the last mismatch

    $XX_tag .= $nuc_match if $nuc_match > 0;    # nothing is written between adjacent mismatches

    # Only positions that exist in the reference contribute a base; read
    # positions beyond the end of a shorter reference are skipped.
    if ($mm_pos <= $ref_length){
      $XX_tag .= substr($ref_seq, $mm_pos - 1, 1);
    }

    $prev_mm_pos = $mm_pos;
  }

  my $end_matches = $ref_length - $prev_mm_pos; # matches between the last mismatch and the end of the reference
  $XX_tag .= $end_matches if $end_matches > 0;  # omitted if the last base was a mismatch

  return $XX_tag;
}
6250
6251
6252
6253 sub print_helpfile{
6254 print << "HOW_TO";
6255
6256
6257 This program is free software: you can redistribute it and/or modify
6258 it under the terms of the GNU General Public License as published by
6259 the Free Software Foundation, either version 3 of the License, or
6260 (at your option) any later version.
6261
6262 This program is distributed in the hope that it will be useful,
6263 but WITHOUT ANY WARRANTY; without even the implied warranty of
6264 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
6265 GNU General Public License for more details.
6266 You should have received a copy of the GNU General Public License
6267 along with this program. If not, see <http://www.gnu.org/licenses/>.
6268
6269
6270
6271 DESCRIPTION
6272
6273
6274 The following is a brief description of command line options and arguments to control the Bismark
6275 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
6276 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
6277 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
6278 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
6279 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
6280 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
6281 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
6282 sequence from the genome and determine if there were any protected C's present or not.
6283
6284 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
6285 re-enabled by using --non_directional.
6286
6287 The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
6288 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
6289 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
6290
6291
6292 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
6293
6294
6295 ARGUMENTS:
6296
6297 <genome_folder> The path to the folder containing the unmodified reference genome
6298 as well as the subfolders created by the Bismark_Genome_Preparation
6299 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
6300 Bismark expects one or more fastA files in this folder (file extension: .fa
6301 or .fasta). The path can be relative or absolute.
6302
6303 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
6304 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
6305 correspond file-for-file and read-for-read with those specified in <mates2>.
6306 Reads may be a mix of different lengths. Bismark will produce one mapping result
6307 and one report file per paired-end input file pair.
6308
6309 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
6310 "_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
6311 correspond file-for-file and read-for-read with those specified in <mates1>.
6312 Reads may be a mix of different lengths.
6313
6314 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
6315 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
6316 produce one mapping result and one report file per input file.
6317
6318
6319 OPTIONS:
6320
6321
6322 Input:
6323
6324 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
6325 files (usually having extension .fq or .fastq). This is the default. See also
6326 --solexa-quals.
6327
6328 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
6329 files (usually having extension .fa, .mfa, .fna or similar). All quality values
6330 are assumed to be 40 on the Phred scale.
6331
6332 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
6333
6334 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
6335
6336 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
6337
6338 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
6339
6340 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
6341 (which can't). The formula for conversion is:
6342 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
6343 is usually the right option for use with (unconverted) reads emitted by the GA
6344 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
6345
6346 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
6347 reads emitted by GA Pipeline version 1.3 or later. Default: off.
6348
6349 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
6350 specified it is assumed that Bowtie (1 or 2) is in the PATH.
6351
6352
6353 Alignment:
6354
6355 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
6356 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
6357 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
6358
6359 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
6360 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
6361 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
6362
6363 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
6364 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
6365 quality values to the nearest 10 and saturates at 30. This value is not relevant for
6366 Bowtie 2.
6367
6368 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
6369 --best mode. Best-first search must keep track of many paths at once to ensure it is
6370 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
6371 memory impact of the descriptors, but they can still grow very large in some cases. If
6372 you receive an error message saying that chunk memory has been exhausted in --best mode,
6373 try adjusting this parameter up to dedicate more memory to the descriptors. This value
6374 is not relevant for Bowtie 2. Default: 512.
6375
6376 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
6377 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
6378 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
6379 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
6380
6381 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
6382 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
6383 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
6384 A 61-bp gap would not be valid in that case. Default: 500.
6385
6386
6387 Bowtie 1 Reporting:
6388
6389 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
6390 will be used by default.
6391
6392 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
6393 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
6394 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
6395 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
6396 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
6397 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
6398 the best alignment). --best mode also removes all strand bias. Note that --best does not
6399 affect which alignments are considered "valid" by Bowtie, only which valid alignments
6400 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
6401 Default: on.
6402
6403 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
6404 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
6405
6406
6407 Output:
6408
6409 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
6410 bisulfite strands will be reported. Default: OFF.
6411
6412 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
6413 to the original strands are merely theoretical and should not exist in reality. Specifying directional
6414 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
6415 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
6416 for strand-specific libraries).
6417
6418 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
6419 split up into several smaller files to run concurrently and the output files are to be merged.
6420
6421 --quiet Print nothing besides alignments.
6422
6423 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
6424 of SAM format output.
6425
6426 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
6427 appear as they did in the input, without any translation of quality values that may have
6428 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
6429 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
6430 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
6431 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
6432
6433 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
6434 mismatches or other reads that fail to align uniquely to a file in the output directory.
6435 Written reads will appear as they did in the input, without any of the translation of quality
6436 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
6437 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
6438 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
6439
6440 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
6441 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
6442 to create it first. The path to the output folder can be either relative or absolute.
6443
6444 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
6445 the specified folder does not exist, Bismark will attempt to create it first. The path to the
6446 temporary folder can be either relative or absolute.
6447
6448
6449
6450 Other:
6451
6452 -h/--help Displays this help file.
6453
6454 -v/--version Displays version information.
6455
6456
6457 BOWTIE 2 SPECIFIC OPTIONS
6458
6459 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
6460 alignments, i.e. searches for alignments involving all read characters (also called
6461 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
6462 and/or quality trimmed where appropriate. Default: off.
6463
6464 Bowtie 2 alignment options:
6465
6466 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
6467 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
6468 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
6469 Bowtie 1 see -n).
6470
6471 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
6472 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
6473 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
6474 Bowtie 1 see -l).
6475
6476 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
6477 position to be the highest possible, regardless of the actual value. I.e. input is treated
6478 as though all quality values are high. This is also the default behavior when the input
6479 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
6480
6481
6482 Bowtie 2 paired-end options:
6483
6484 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
6485 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
6486 on by default.
6487
6488 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
6489 A discordant alignment is an alignment where both mates align uniquely, but that does not
6490 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
6491 and it is on by default.
6492
6493
6494 Bowtie 2 effort options:
6495
6496 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
6497 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
6498 new second-best alignment. Default: 15.
6499
6500 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
6501 When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
6502 mismatches allowed) at different offsets and searches for more alignments. A read is considered
6503 to have repetitive seeds if the total number of seed hits divided by the number of seeds
6504 that aligned at least once is greater than 300. Default: 2.
6505
6506 Bowtie 2 parallelization options:
6507
6508
6509 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
6510 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
6511 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
6512 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
6513 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
6514 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
6515 automatically use the option '--reorder', which guarantees that output SAM records are printed in
6516 an order corresponding to the order of the reads in the original input file, even when -p is set
6517 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
6518 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
6519 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
6520 correspond to input order in that case.
6521
6522 Bowtie 2 Scoring options:
6523
6524 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
6525 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
6526 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
6527 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
6528 L,0,-0.2.
6529
6530 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
6531 of <int1> + N * <int2>. Default: 5, 3.
6532
6533 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
6534 a penalty of <int1> + N * <int2>. Default: 5, 3.
6535
6536
6537 Bowtie 2 Reporting options:
6538
6539 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
6540 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
6541 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
6542 effort expended to find valid alignments.
6543
6544 For reference, this used to be the old (now deprecated) description of -M:
6545 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
6546 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
6547 happens first. Only the best alignment is reported. Information from the other alignments is used to
6548 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
6549 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
6550 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
6551 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
6552 always used and its default value is set to 10.
6553
6554
6555 'VANILLA' Bismark OUTPUT:
6556
6557 Single-end output format (tab-separated):
6558
6559 (1) <seq-ID>
6560 (2) <read alignment strand>
6561 (3) <chromosome>
6562 (4) <start position>
6563 (5) <end position>
6564 (6) <observed bisulfite sequence>
6565 (7) <equivalent genomic sequence>
6566 (8) <methylation call>
6567 (9) <read conversion>
6568 (10) <genome conversion>
6569 (11) <read quality score (Phred33)>
6570
6571
6572 Paired-end output format (tab-separated):
6573 (1) <seq-ID>
6574 (2) <read 1 alignment strand>
6575 (3) <chromosome>
6576 (4) <start position>
6577 (5) <end position>
6578 (6) <observed bisulfite sequence 1>
6579 (7) <equivalent genomic sequence 1>
6580 (8) <methylation call 1>
6581 (9) <observed bisulfite sequence 2>
6582 (10) <equivalent genomic sequence 2>
6583 (11) <methylation call 2>
6584 (12) <read 1 conversion>
6585 (13) <genome conversion>
6586 (14) <read 1 quality score (Phred33)>
6587 (15) <read 2 quality score (Phred33)>
6588
6589
6590 Bismark SAM OUTPUT (default):
6591
6592 (1) QNAME (seq-ID)
6593 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
6594 (3) RNAME (chromosome)
6595 (4) POS (start position)
6596 (5) MAPQ (always 255)
6597 (6) CIGAR
6598 (7) RNEXT
6599 (8) PNEXT
6600 (9) TLEN
6601 (10) SEQ
6602 (11) QUAL (Phred33 scale)
6603 (12) NM-tag (edit distance to the reference)
6604 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
6605 (14) XM-tag (methylation call string)
6606 (15) XR-tag (read conversion state for the alignment)
6607 (16) XG-tag (genome conversion state for the alignment)
6608
6609 Each read of paired-end alignments is written out in a separate line in the above format.
6610
6611
6612 This script was last edited on 21 Aug 2012.
6613
6614 HOW_TO
6615 }