Mercurial > repos > bgruening > bismark
comparison old/bismark @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/perl -- | |
2 use strict; | |
3 use warnings; | |
4 use IO::Handle; | |
5 use Cwd; | |
6 $|++; | |
7 use Getopt::Long; | |
8 | |
9 | |
10 ## This program is Copyright (C) 2010-13, Felix Krueger (felix.krueger@babraham.ac.uk) | |
11 | |
12 ## This program is free software: you can redistribute it and/or modify | |
13 ## it under the terms of the GNU General Public License as published by | |
14 ## the Free Software Foundation, either version 3 of the License, or | |
15 ## (at your option) any later version. | |
16 | |
17 ## This program is distributed in the hope that it will be useful, | |
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 ## GNU General Public License for more details. | |
21 | |
22 ## You should have received a copy of the GNU General Public License | |
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
24 | |
25 | |
### Directory Bismark was launched from; the per-file loop changes back to it before each input file
my $parent_dir      = getcwd;
my $bismark_version = 'v0.10.0';
my $command_line    = join (" ",@ARGV); # captured before the option rewrite below touches @ARGV

### The '.' in '--solexa1.3-quals' causes Getopt::Long to fail, so that spelling is swapped for
### '--phred64-quals' before the command line is processed ($_ aliases the @ARGV element, so the
### replacement happens in place)
for (@ARGV){
  $_ = '--phred64-quals' if $_ eq '--solexa1.3-quals';
}

my @filenames; # will be populated by processing the command line

### All run-time settings, in exactly the order in which process_command_line() returns them
my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
    $prefix,
    $old_flag,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome; once filled it is reused for every further input file
my %counting;    # counting various events

my $seqID_contains_tabs;
45 | |
### Main loop: every entry of @filenames is processed in turn. An entry that contains a comma is
### treated as a 'mate1,mate2' pair (paired-end alignments), any other entry as a single-end file.
### For each entry the reads are bisulfite converted into temporary files, the names of these files
### are registered in @fhs, the alignments are started, and finally the methylation call procedure
### is kicked off.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";

  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    my @alignment_names = qw(CTread1GAread2CTgenome GAread1CTread2GAgenome GAread1CTread2CTgenome CTread1GAread2GAgenome);
    $fhs[$_]->{name} = $alignment_names[$_] for (0..3);

    ### Registers the converted input files with the four alignment slots. Slots 0 and 3 always
    ### share the same (inputfile_1, inputfile_2) pair (C->T read 1 / G->A read 2), and so do
    ### slots 1 and 2 (G->A read 1 / C->T read 2). Values that are undef are stored as undef.
    my $register_paired_inputs = sub {
      my ($CTread1_GAread2,$GAread1_CTread2) = @_;
      @{$fhs[$_]}{qw(inputfile_1 inputfile_2)} = @$CTread1_GAread2 for (0,3);
      @{$fhs[$_]}{qw(inputfile_1 inputfile_2)} = @$GAread1_CTread2 for (1,2);
    };

    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = split (/,/,$filename);
    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){
        ### only the first file name returned per mate is kept: C->T for read 1, G->A for read 2
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }

      ### in directional mode $G_to_A_infile_1 and $C_to_T_infile_2 were never assigned, so
      ### slots 1 and 2 receive undef for both input files
      $register_paired_inputs->([$C_to_T_infile_1,$G_to_A_infile_2],[$G_to_A_infile_1,$C_to_T_infile_2]);

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      ### gzip compressed temporary files are only produced for Bowtie 1 alignments; in that case
      ### a single call receives both mates and no separate mate 2 file is registered
      my $gzipped_bowtie1_tempfiles = (!$bowtie2 and $gzip);

      if ($directional){
        if ($gzipped_bowtie1_tempfiles){
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        ### every variable not assigned above is still undef and is registered as such
        $register_paired_inputs->([$C_to_T_infile_1,$G_to_A_infile_2],[$G_to_A_infile_1,$C_to_T_infile_2]);
      }
      elsif ($pbat){ # PBAT-Seq
        ### At the moment we are only performing uncompressed FastQ alignments with Bowtie1
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        ### slots 0 and 3 are explicitly left without input files for PBAT-Seq
        $register_paired_inputs->([undef,undef],[$G_to_A_infile_1,$C_to_T_infile_2]);
      }
      else{
        if ($gzipped_bowtie1_tempfiles){
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        ### for compressed temp files the mate 2 variables are undef, i.e. inputfile_2 is undef in all slots
        $register_paired_inputs->([$C_to_T_infile_1,$G_to_A_infile_2],[$G_to_A_infile_1,$C_to_T_infile_2]);
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
    next;
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

  ### Initialising bisulfite conversion filenames; a name stays undef if that conversion is not made below
  my ($C_to_T_infile,$G_to_A_infile);

  ### FastA format
  if ($sequence_file_format eq 'FASTA'){
    warn "Input file is in FastA format\n"; # message used to read 'Inut file'

    if ($directional){
      ($C_to_T_infile) = biTransformFastAFiles ($filename);
      $fhs[$_]->{inputfile} = $C_to_T_infile for (0,1);
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
      $fhs[$_]->{inputfile} = $C_to_T_infile for (0,1);
      $fhs[$_]->{inputfile} = $G_to_A_infile for (2,3);
    }

    ### Creating 4 different bowtie filehandles and storing the first entry
    if ($bowtie2){
      single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
    }
    else{
      single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
    }
  }

  ### FastQ format
  else{
    warn "Input file is in FastQ format\n";

    if ($directional){
      ($C_to_T_infile) = biTransformFastQFiles ($filename);
      $fhs[$_]->{inputfile} = $C_to_T_infile for (0,1);
    }
    elsif ($pbat){
      ### the first (only) name taken from the conversion is used as the G->A file here
      ($G_to_A_infile) = biTransformFastQFiles ($filename);
      $fhs[$_]->{inputfile} = $G_to_A_infile for (0,1); # PBAT-Seq only uses the G to A converted files
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
      $fhs[$_]->{inputfile} = $C_to_T_infile for (0,1);
      $fhs[$_]->{inputfile} = $G_to_A_infile for (2,3);
    }

    ### Creating up to 4 different bowtie filehandles and storing the first entry
    if ($bowtie2){
      single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
    }
    elsif ($pbat){
      single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile);
    }
    else{
      single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile);
    }
  }

  start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
}
282 | |
### Prepares all output for one single-end input file and then starts the methylation calls.
###
### Opens the global filehandles OUT (alignment output, optionally piped through samtools or gzip),
### REPORT and - if requested - UNMAPPED and AMBIG below $output_dir, reads the genome into memory
### unless %chromosomes is already populated, writes the SAM header unless $vanilla or $sam_no_hd
### is set, and finally calls the FastA or FastQ specific processing routine.
###
### Arguments:
###   $sequence_file  - the input file as given on the command line (may include a path)
###   $C_to_T_infile  - name of the C->T converted temporary file (passed through unchanged)
###   $G_to_A_infile  - name of the G->A converted temporary file (passed through unchanged)
###
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### all output names are based on the input name without its directory part
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### an optional --prefix is put in front of every output name
  my $basename = $prefix ? "$prefix.$filename" : $filename;

  ### printing all alignments to a results file
  my $outfile = $basename;

  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile .= '_bismark_bt2.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark.txt';
  }
  else{ # SAM is the default output
    $outfile .= '_bismark.sam';
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### Only the extension may change. An unanchored s/sam/bam/ would hit the first 'sam' in the
    ### name instead, turning e.g. 'sample.fq_bismark.sam' into 'bample.fq_bismark.sam'
    $outfile =~ s/sam$/bam/;
    ### NOTE: the command goes through the shell, so paths containing whitespace or shell metacharacters are not supported
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $basename . ($bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt');

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_file = "${basename}_unmapped_reads.txt";
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    print "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }
  if ($ambiguous){
    my $ambiguous_file = "${basename}_ambiguous_reads.txt";
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    print "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
394 | |
### Prepares all output for one pair of paired-end input files and then starts the methylation calls.
###
### Opens the global filehandles OUT (alignment output, optionally piped through samtools or gzip),
### REPORT and - if requested - UNMAPPED_1/UNMAPPED_2 and AMBIG_1/AMBIG_2 below $output_dir, reads
### the genome into memory unless %chromosomes is already populated, writes the SAM header unless
### $vanilla or $sam_no_hd is set, and finally calls the FastA or FastQ specific processing routine.
### The alignment output and the report are named after the first mate's file.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2   - the two input files (may include a path)
###   $C_to_T_infile_1, $G_to_A_infile_1   - converted temporary files of mate 1 (passed through unchanged)
###   $C_to_T_infile_2, $G_to_A_infile_2   - converted temporary files of mate 2 (passed through unchanged)
###
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### all output names are based on the input names without their directory part
  my $filename_1 = $sequence_file_1;
  if ($sequence_file_1 =~ /\//){
    ($filename_1) = $sequence_file_1 =~ m/.*\/(.*)$/;
  }

  my $filename_2 = $sequence_file_2;
  if ($sequence_file_2 =~ /\//){
    ($filename_2) = $sequence_file_2 =~ m/.*\/(.*)$/;
  }

  ### an optional --prefix is put in front of every output name
  my $basename_1 = $prefix ? "$prefix.$filename_1" : $filename_1;
  my $basename_2 = $prefix ? "$prefix.$filename_2" : $filename_2;

  ### printing all alignments to a results file
  my $outfile = $basename_1;

  if ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile .= '_bismark_bt2_pe.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile .= '_bismark_pe.txt';
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile .= '_bismark_pe.sam';
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    ### Only the extension may change. An unanchored s/sam/bam/ would hit the first 'sam' in the
    ### name instead, turning e.g. 'sample_1.fq_bismark_pe.sam' into 'bample_1.fq_bismark_pe.sam'
    $outfile =~ s/sam$/bam/;
    ### NOTE: the command goes through the shell, so paths containing whitespace or shell metacharacters are not supported
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    open (OUT,'|-',"gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed ouput, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $basename_1 . ($bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt');

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = "${basename_1}_unmapped_reads_1.txt";
    my $unmapped_2 = "${basename_2}_unmapped_reads_2.txt";
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguously mapping read output
  if ($ambiguous){
    my $amb_1 = "${basename_1}_ambiguous_reads_1.txt";
    my $amb_2 = "${basename_2}_ambiguous_reads_2.txt";
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }
}
526 | |
527 sub print_final_analysis_report_single_end{ | |
528 my ($C_to_T_infile,$G_to_A_infile) = @_; | |
529 ### All sequences from the original sequence file have been analysed now | |
530 ### deleting temporary C->T or G->A infiles | |
531 | |
532 if ($directional){ | |
533 my $deletion_successful = unlink "$temp_dir$C_to_T_infile"; | |
534 if ($deletion_successful == 1){ | |
535 warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n"; | |
536 } | |
537 else{ | |
538 warn "Could not delete temporary file $C_to_T_infile properly $!\n"; | |
539 } | |
540 } | |
541 elsif ($pbat){ | |
542 my $deletion_successful = unlink "$temp_dir$G_to_A_infile"; | |
543 if ($deletion_successful == 1){ | |
544 warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n"; | |
545 } | |
546 else{ | |
547 warn "Could not delete temporary file $G_to_A_infile properly $!\n"; | |
548 } | |
549 } | |
550 else{ | |
551 my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile"; | |
552 if ($deletion_successful == 2){ | |
553 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n"; | |
554 } | |
555 else{ | |
556 warn "Could not delete temporary files properly $!\n"; | |
557 } | |
558 } | |
559 | |
560 ### printing a final report for the alignment procedure | |
561 print REPORT "Final Alignment report\n",'='x22,"\n"; | |
562 warn "Final Alignment report\n",'='x22,"\n"; | |
563 # foreach my $index (0..$#fhs){ | |
564 # print "$fhs[$index]->{name}\n"; | |
565 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n"; | |
566 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n"; | |
567 # } | |
568 | |
569 ### printing a final report for the methylation call procedure | |
570 warn "Sequences analysed in total:\t$counting{sequences_count}\n"; | |
571 print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n"; | |
572 my $percent_alignable_sequences; | |
573 | |
574 if ($counting{sequences_count} == 0){ | |
575 $percent_alignable_sequences = 0; | |
576 } | |
577 else{ | |
578 $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count}); | |
579 } | |
580 | |
581 warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n"; | |
582 print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n"; | |
583 | |
584 ### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads), | |
585 ### only calculating the percentage if there were any overruled alignments | |
586 if ($counting{low_complexity_alignments_overruled_count}){ | |
587 my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count}); | |
588 # print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n"; | |
589 } | |
590 | |
591 print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; | |
592 print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; | |
593 print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; | |
594 print "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; | |
595 print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; | |
596 | |
597 print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n"; | |
598 print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n"; | |
599 print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n"; | |
600 print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n"; | |
601 print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n"; | |
602 | |
603 if ($directional){ | |
604 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; | |
605 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n"; | |
606 } | |
607 | |
608 ### detailed information about Cs analysed | |
609 warn "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
610 my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count}; | |
611 warn "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
612 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
613 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
614 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; | |
615 if ($bowtie2){ | |
616 warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; | |
617 } | |
618 warn "\n"; | |
619 | |
620 warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
621 warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
622 warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
623 if ($bowtie2){ | |
624 warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; | |
625 } | |
626 warn "\n"; | |
627 | |
628 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n"; | |
629 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n"; | |
630 | |
631 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n"; | |
632 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n"; | |
633 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n"; | |
634 if ($bowtie2){ | |
635 print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n"; | |
636 } | |
637 print REPORT "\n"; | |
638 | |
639 print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n"; | |
640 print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n"; | |
641 print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n"; | |
642 if ($bowtie2){ | |
643 print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n"; | |
644 } | |
645 print REPORT "\n"; | |
646 | |
647 my $percent_meCHG; | |
648 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){ | |
649 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count})); | |
650 } | |
651 | |
652 my $percent_meCHH; | |
653 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){ | |
654 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count})); | |
655 } | |
656 | |
657 my $percent_meCpG; | |
658 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){ | |
659 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count})); | |
660 } | |
661 | |
662 my $percent_meC_unknown; | |
663 if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){ | |
664 $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count})); | |
665 } | |
666 | |
667 | |
668 ### printing methylated CpG percentage if applicable | |
669 if ($percent_meCpG){ | |
670 warn "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
671 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n"; | |
672 } | |
673 else{ | |
674 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
675 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n"; | |
676 } | |
677 | |
678 ### printing methylated C percentage (CHG context) if applicable | |
679 if ($percent_meCHG){ | |
680 warn "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
681 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n"; | |
682 } | |
683 else{ | |
684 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
685 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n"; | |
686 } | |
687 | |
688 ### printing methylated C percentage (CHH context) if applicable | |
689 if ($percent_meCHH){ | |
690 warn "C methylated in CHH context:\t${percent_meCHH}%\n"; | |
691 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n"; | |
692 } | |
693 else{ | |
694 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; | |
695 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n"; | |
696 } | |
697 | |
698 ### printing methylated C percentage (Unknown C context) if applicable | |
699 if ($bowtie2){ | |
700 if ($percent_meC_unknown){ | |
701 warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
702 print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n"; | |
703 } | |
704 else{ | |
705 warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; | |
706 print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n"; | |
707 } | |
708 } | |
709 print REPORT "\n\n"; | |
710 warn "\n\n"; | |
711 | |
712 if ($seqID_contains_tabs){ | |
713 warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; | |
714 print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n"; | |
715 } | |
716 | |
717 | |
718 ########################################################################################################################################### | |
719 ### create pie-chart with mapping stats | |
720 ########################################################################################################################################### | |
721 | |
722 | |
723 my $filename; | |
724 if ($pbat){ | |
725 $filename = $G_to_A_infile; | |
726 } | |
727 else{ | |
728 $filename = $C_to_T_infile; | |
729 } | |
730 | |
731 my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified | |
732 $pie_chart =~ s/gz$//; | |
733 $pie_chart =~ s/_C_to_T\.fastq$//; | |
734 $pie_chart =~ s/_G_to_A\.fastq$//; | |
735 | |
736 # if ($prefix){ | |
737 # $pie_chart = "$prefix.$pie_chart"; # this is now being taken care of in file transformation | |
738 # } | |
739 $pie_chart = "${output_dir}${pie_chart}_bismark_SE.alignment_overview.png"; | |
740 | |
741 | |
742 #Check whether the module GD::Graph is installed | |
743 my $gd_graph_installed = 0; | |
744 eval{ | |
745 require GD::Graph::pie; | |
746 GD::Graph::pie->import(); | |
747 }; | |
748 | |
749 unless($@) { | |
750 $gd_graph_installed = 1; | |
751 } | |
752 else{ | |
753 warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n"; | |
754 sleep(2); | |
755 } | |
756 | |
757 if ($gd_graph_installed){ | |
758 warn "Generating pie chart\n\n"; | |
759 sleep(1); | |
760 my $graph = GD::Graph::pie->new(600,600); | |
761 | |
762 my $percent_unaligned; | |
763 my $percent_multiple; | |
764 my $percent_unextractable; | |
765 | |
766 if ($counting{sequences_count}){ | |
767 $percent_unaligned = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count}); | |
768 $percent_multiple = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count}); | |
769 $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count}); | |
770 } | |
771 else{ | |
772 $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A'; | |
773 } | |
774 | |
775 my @aln_stats = ( | |
776 ["Uniquely aligned $percent_alignable_sequences%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"], | |
777 [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}], | |
778 ); | |
779 | |
780 $graph->set( | |
781 start_angle => 180, | |
782 '3d' => 0, | |
783 label => 'Alignment stats (single-end)', | |
784 suppress_angle => 2, # Only label slices of sufficient size | |
785 transparent => 0, | |
786 dclrs => [ qw(red lorange dgreen cyan) ], | |
787 ) or die $graph->error; | |
788 | |
789 my $gd = $graph->plot(\@aln_stats) or die $graph->error; | |
790 | |
791 open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n"; | |
792 binmode PIE; | |
793 print PIE $gd->png; | |
794 } | |
795 | |
796 warn "====================\nBismark run complete\n====================\n\n"; | |
797 | |
798 } | |
799 | |
800 | |
sub print_final_analysis_report_paired_ends{
  ### Finishes a paired-end run. In this order the subroutine
  ###   1) deletes the temporary C->T / G->A converted input files from $temp_dir,
  ###   2) prints the final alignment report and the final cytosine methylation report
  ###      (to STDERR/STDOUT and to the REPORT filehandle), and
  ###   3) writes a pie chart (PNG) of the mapping statistics into $output_dir if the
  ###      module GD::Graph::pie can be loaded.
  ### Arguments: the names of the temporary C->T and G->A files of read 1 and read 2; files
  ### 3 and 4 may be empty (see the comments on the deletion branches below).
  ### Reads the file-level variables $directional, $temp_dir, $bowtie2, $pbat, $output_dir
  ### and the %counting hash; none of them is modified here.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
  if ($directional){
    if ($G_to_A_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1";
      if ($deletion_successful == 1){
        warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
      }
    }
  }
  else{
    if ($G_to_A_infile_2 and $C_to_T_infile_2){
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
      if ($deletion_successful == 4){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
      my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
      }
      else{
        warn "Could not delete temporary files properly: $!\n";
      }
    }
  }

  ### printing a final report for the alignment procedure
  warn "Final Alignment report\n",'='x22,"\n";
  print REPORT "Final Alignment report\n",'='x22,"\n";
  #  foreach my $index (0..$#fhs){
  #    print "$fhs[$index]->{name}\n";
  #    print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
  #    print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
  #  }

  ### printing a final report for the methylation call procedure
  warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";

  ### mapping efficiency; reported as 0 if no sequence pair was analysed (avoids a division by zero)
  my $percent_alignable_sequence_pairs;
  if ($counting{sequences_count} == 0){
    $percent_alignable_sequence_pairs = 0;
  }
  else{
    $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }
  print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
  print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

  print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";


  print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
  print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
  print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
  print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
  print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
  ### detailed information about Cs analysed

  if ($directional){
    print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }

  warn "Final Cytosine Methylation Report\n",'='x33,"\n";
  print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";

  ### the total only sums the CpG, CHG and CHH counters; the Unknown context counters are not part of it
  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
  warn "Total number of C's analysed:\t$total_number_of_C\n\n";
  warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
  }
  warn "\n";

  warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
  }
  warn "\n";

  print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
  print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
  print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
  print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
  if ($bowtie2){
    print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
  }
  print REPORT "\n";

  print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
  print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
  print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
  if ($bowtie2){
    print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
  }
  print REPORT "\n";

  ### Percentages are only calculated if at least one C was seen in the respective context; otherwise the
  ### variable stays undefined. Note that sprintf "%.1f" returns strings such as "0.0", which are true in
  ### Perl, so the truth tests further down only fail for the undefined (= no C in this context) case.
  my $percent_meCHG;
  if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
    $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
  }

  my $percent_meCHH;
  if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
    $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
  }

  my $percent_meCpG;
  if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
    $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
  }

  my $percent_meC_unknown;
  if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
    $percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
  }


  ### printing methylated CpG percentage if applicable
  if ($percent_meCpG){
    warn "C methylated in CpG context:\t${percent_meCpG}%\n";
    print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
  }

  ### printing methylated C percentage in CHG context if applicable
  if ($percent_meCHG){
    warn "C methylated in CHG context:\t${percent_meCHG}%\n";
    print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
  }

  ### printing methylated C percentage in CHH context if applicable
  if ($percent_meCHH){
    warn "C methylated in CHH context:\t${percent_meCHH}%\n";
    print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
  }
  else{
    warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
    print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
  }

  ### printing methylated C percentage (Unknown C context) if applicable
  if ($bowtie2){
    if ($percent_meC_unknown){
      warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
      print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
    }
    else{
      warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
      print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
    }
  }
  print REPORT "\n\n";
  warn "\n\n";


  ############################################################################################################################################
  ### create pie-chart with mapping stats
  ###########################################################################################################################################

  ### the name of the chart is derived from the temporary file of read 1 (the G->A file in --pbat mode)
  my $filename;
  if ($pbat){
    $filename = $G_to_A_infile_1;
  }
  else{
    $filename = $C_to_T_infile_1;
  }

  my $pie_chart = (split (/\//,$filename))[-1]; # extracting the filename if a full path was specified
  $pie_chart =~ s/gz$//;
  ### the dots are escaped so that only a literal '.fastq' ending is removed (same patterns as in the single-end report)
  $pie_chart =~ s/_C_to_T\.fastq$//;
  $pie_chart =~ s/_G_to_A\.fastq$//;
  ### special format for gzipped PE Bowtie1 files
  $pie_chart =~ s/\.CT_plus_GA\.fastq\.$//;
  $pie_chart =~ s/\.GA_plus_CT\.fastq\.$//;

  ### a --prefix does not need to be handled here, it is being prepended to the temp files already
  $pie_chart = "${output_dir}${pie_chart}_bismark_PE.alignment_overview.png";

  #Check whether the module GD::Graph is installed
  my $gd_graph_installed = 0;
  eval{
    require GD::Graph::pie;
    GD::Graph::pie->import();
  };

  unless($@) {
    $gd_graph_installed = 1;
  }
  else{
    warn "Perl module GD::Graph::pie is not installed, skipping graphical alignment summary\n";
    sleep(2);
  }

  if ($gd_graph_installed){
    warn "Generating pie chart\n\n";
    sleep(1);
    my $graph = GD::Graph::pie->new(600,600);

    my $percent_unaligned;
    my $percent_multiple;
    my $percent_unextractable;

    if ($counting{sequences_count}){
      $percent_unaligned     = sprintf ("%.1f",$counting{no_single_alignment_found}*100/$counting{sequences_count});
      $percent_multiple      = sprintf ("%.1f",$counting{unsuitable_sequence_count}*100/$counting{sequences_count});
      $percent_unextractable = sprintf ("%.1f",$counting{genomic_sequence_could_not_be_extracted_count}*100/$counting{sequences_count});
    }
    else{
      $percent_unaligned = $percent_multiple = $percent_unextractable = 'N/A';
    }

    ### first row: slice labels, second row: slice values
    my @aln_stats = (
                     ["Uniquely aligned pairs $percent_alignable_sequence_pairs%","Unaligned $percent_unaligned%","Multiple alignments $percent_multiple%","sequence unextractable $percent_unextractable%"],
                     [$counting{unique_best_alignment_count},$counting{no_single_alignment_found},$counting{unsuitable_sequence_count},$counting{genomic_sequence_could_not_be_extracted_count}],
                    );

    # push @{$mbias_read1[0]},$pos;

    $graph->set(
                start_angle => 180,
                '3d' => 0,
                label => 'Alignment stats (paired-end)',
                suppress_angle => 2, # Only label slices of sufficient size
                transparent => 0,
                dclrs => [ qw(red lorange dgreen cyan) ],
               ) or die $graph->error;

    my $gd = $graph->plot(\@aln_stats) or die $graph->error;

    open (PIE,'>',$pie_chart) or die "Failed to write to file for alignments pie chart: $!\n\n";
    binmode PIE;
    print PIE $gd->png;
    ### buffered write errors (e.g. a full disk) only surface when the handle is closed. The chart is an
    ### optional extra, so a failure here is reported but does not abort the otherwise finished run
    close PIE or warn "Failed to close file for alignments pie chart $pie_chart: $!\n";
  }

  warn "====================\nBismark run complete\n====================\n\n";

}
1076 | |
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments: the original sequence file and the names of the temporary C->T and G->A converted files; the latter two are
  ### only handed on to print_final_analysis_report_single_end once all sequences have been processed.
  ### Honours the file-level options $skip, $upto, $bowtie2, $ambiguous and $unmapped and increments $counting{sequences_count}.

  ### gzipped version of the infile
  ### The list form of the pipe open runs zcat without a shell, and the 3-argument open takes the file name literally, so
  ### file names containing spaces or shell/mode characters are read correctly
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0; # number of records read so far, including the ones that are skipped

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    ### a FastA record is expected to consist of exactly 2 lines (header and sequence)
    my $identifier = <IN>;
    my $sequence = <IN>;
    last unless ($identifier and $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip ignores the first $skip records, --upto stops after record number $upto
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
    }

    ### an undefined/empty return value is treated as 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
1149 | |
sub process_single_end_fastQ_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;
  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version
  ###
  ### Arguments: the original sequence file and the names of the temporary C->T and G->A converted files; the latter two are
  ### only handed on to print_final_analysis_report_single_end once all sequences have been processed.
  ### Honours the file-level options $skip, $upto, $bowtie2, $ambiguous and $unmapped and increments $counting{sequences_count}.

  ### gzipped version of the infile
  ### The list form of the pipe open runs zcat without a shell, and the 3-argument open takes the file name literally, so
  ### file names containing spaces or shell/mode characters are read correctly
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from sequence file $sequence_file: $!\n";
  }

  my $count = 0; # number of records read so far, including the ones that are skipped

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    ### a FastQ record is expected to consist of exactly 4 lines
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    last unless ($identifier and $sequence and $identifier_2 and $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### --skip ignores the first $skip records, --upto stops after record number $upto
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
    }

    ### an undefined/empty return value is treated as 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2; # was never chomped and therefore still carries its line ending
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2; # was never chomped and therefore still carries its line ending
      print UNMAPPED "$quality_value\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
1229 | |
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)
  ###
  ### Arguments: the two original sequence files followed by the names of the temporary C->T and G->A converted files of read 1
  ### and read 2; the temporary file names are only handed on to print_final_analysis_report_paired_ends at the end.
  ### Honours the file-level options $skip, $upto, $bowtie2, $ambiguous and $unmapped and increments $counting{sequences_count}.

  ### gzipped version of the infiles
  ### Whether a file needs to be decompressed is decided for each file individually, so a pair consisting of one gzipped and
  ### one plain file is read correctly as well. The list form of the pipe open runs zcat without a shell, and the 3-argument
  ### open takes the file name literally, so file names containing spaces or shell/mode characters are safe
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from sequence file $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from sequence file $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0; # number of sequence pairs read so far, including the ones that are skipped

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### --skip ignores the first $skip sequence pairs, --upto stops after pair number $upto
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }
    ### copies of the header lines as returned by fix_IDs, taken before chomp and before the leading > is removed;
    ### these are what gets written to the ambiguous/unmapped files below
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    ### only the identifier of read 1 is passed on to the alignment check
    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }

    ### an undefined/empty return value is treated as 0 so that the numeric comparisons below do not warn
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
1323 | |
1324 sub process_fastQ_files_for_paired_end_methylation_calls{ | |
1325 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_; | |
1326 ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to | |
1327 ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments. | |
1328 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both) | |
1329 ### of the converted genomes (either C->T or G->A version) | |
1330 | |
1331 ### gzipped version of the infiles | |
1332 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){ | |
1333 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n"; | |
1334 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n"; | |
1335 } | |
1336 else{ | |
1337 open (IN1,$sequence_file_1) or die $!; | |
1338 open (IN2,$sequence_file_2) or die $!; | |
1339 } | |
1340 | |
1341 my $count = 0; | |
1342 | |
1343 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n"; | |
1344 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one | |
1345 while (1) { | |
1346 # reading from the first input file | |
1347 my $identifier_1 = <IN1>; | |
1348 my $sequence_1 = <IN1>; | |
1349 my $ident_1 = <IN1>; # not needed | |
1350 my $quality_value_1 = <IN1>; # not needed | |
1351 # reading from the second input file | |
1352 my $identifier_2 = <IN2>; | |
1353 my $sequence_2 = <IN2>; | |
1354 my $ident_2 = <IN2>; # not needed | |
1355 my $quality_value_2 = <IN2>; # not needed | |
1356 last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2); | |
1357 | |
1358 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces | |
1359 $identifier_2 = fix_IDs($identifier_2); | |
1360 | |
1361 ++$count; | |
1362 | |
1363 if ($skip){ | |
1364 next unless ($count > $skip); | |
1365 } | |
1366 if ($upto){ | |
1367 last if ($count > $upto); | |
1368 } | |
1369 | |
1370 $counting{sequences_count}++; | |
1371 if ($counting{sequences_count}%1000000==0) { | |
1372 warn "Processed $counting{sequences_count} sequence pairs so far\n"; | |
1373 } | |
1374 | |
1375 my $orig_identifier_1 = $identifier_1; | |
1376 my $orig_identifier_2 = $identifier_2; | |
1377 | |
1378 chomp $sequence_1; | |
1379 chomp $identifier_1; | |
1380 chomp $sequence_2; | |
1381 chomp $identifier_2; | |
1382 chomp $quality_value_1; | |
1383 chomp $quality_value_2; | |
1384 | |
1385 $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID | |
1386 | |
1387 my $return; | |
1388 if ($bowtie2){ | |
1389 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); | |
1390 } | |
1391 else{ | |
1392 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2); | |
1393 } | |
1394 | |
1395 unless ($return){ | |
1396 $return = 0; | |
1397 } | |
1398 | |
1399 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified | |
1400 if ($ambiguous and $return == 2){ | |
1401 # seq_1 | |
1402 print AMBIG_1 $orig_identifier_1; | |
1403 print AMBIG_1 "$sequence_1\n"; | |
1404 print AMBIG_1 $ident_1; | |
1405 print AMBIG_1 "$quality_value_1\n"; | |
1406 # seq_2 | |
1407 print AMBIG_2 $orig_identifier_2; | |
1408 print AMBIG_2 "$sequence_2\n"; | |
1409 print AMBIG_2 $ident_2; | |
1410 print AMBIG_2 "$quality_value_2\n"; | |
1411 } | |
1412 | |
1413 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified | |
1414 elsif ($unmapped and $return == 1){ | |
1415 # seq_1 | |
1416 print UNMAPPED_1 $orig_identifier_1; | |
1417 print UNMAPPED_1 "$sequence_1\n"; | |
1418 print UNMAPPED_1 $ident_1; | |
1419 print UNMAPPED_1 "$quality_value_1\n"; | |
1420 # seq_2 | |
1421 print UNMAPPED_2 $orig_identifier_2; | |
1422 print UNMAPPED_2 "$sequence_2\n"; | |
1423 print UNMAPPED_2 $ident_2; | |
1424 print UNMAPPED_2 "$quality_value_2\n"; | |
1425 } | |
1426 } | |
1427 | |
1428 warn "Processed $counting{sequences_count} sequences in total\n\n"; | |
1429 | |
1430 close OUT or die $!; | |
1431 | |
1432 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); | |
1433 | |
1434 } | |
1435 | |
1436 sub check_bowtie_results_single_end{ | |
     ### Purpose: collects the bowtie alignments reported for one single-end read from every entry of @fhs, decides whether a | |
     ### unique best alignment exists and, if so, extracts the genomic sequence, performs the methylation call and prints the result. | |
     ### | |
     ### Arguments: | |
     ###   $sequence      - the read sequence | |
     ###   $identifier    - the read ID; it is compared against ->{last_seq_id} of each @fhs entry | |
     ###   $quality_value - the quality string; if it is false, 'I' x length($sequence) is used instead | |
     ### | |
     ### Return values (interpreted by the caller): | |
     ###   0 or an empty return - nothing further to do for this read (aligned uniquely, rejected, or no extra output files requested) | |
     ###   1 - the read produced no alignment (or an ambiguous one while $ambiguous is not set) and $unmapped is set | |
     ###   2 - the read produced several equally good alignments and $ambiguous is set | |
     ### | |
     ### Side effects: reads ahead in the filehandles stored in @fhs (updating ->{last_line} and ->{last_seq_id}) and | |
     ### increments several counters in %counting. Dies on malformed chromosome names or mismatch fields. | |
1437 my ($sequence,$identifier,$quality_value) = @_; | |
1438 | |
     ### NOTE(review): this is a truth test, so a quality string consisting of the single character '0' would also be replaced - confirm this is acceptable | |
1439 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout | |
1440 $quality_value = 'I'x(length$sequence); | |
1441 } | |
1442 | |
     ### %mismatches is keyed first by the number of mismatches and then by the composite location "chromosome:position" | |
1443 my %mismatches = (); | |
1444 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome | |
1445 foreach my $index (0..$#fhs){ | |
1446 | |
1447 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
1448 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); | |
1449 ### if the sequence we are currently looking at produced an alignment we are doing various things with it | |
1450 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
1451 ############################################################### | |
1452 ### STEP I Now processing the alignment stored in last_line ### | |
1453 ############################################################### | |
1454 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); | |
1455 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation | |
1456 ### we only continue to extract useful information about this alignment if 1 was returned | |
1457 if ($valid_alignment_found_1 == 1){ | |
1458 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself | |
1459 ### the chromosome name is obtained below by stripping the _CT_converted or _GA_converted suffix from the reference name reported by bowtie | |
     ### split with a limit of -1 keeps trailing empty fields; only fields 0-4 and 7 (the mismatch field) are used, and $strand is not read again in this sub | |
1460 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; | |
1461 | |
1462 unless($mismatch_info){ | |
1463 $mismatch_info = ''; | |
1464 } | |
1465 | |
     ### lines read in this sub are stored unchomped, so the mismatch field (taken as the last field here) can still carry the newline | |
1466 chomp $mismatch_info; | |
1467 my $chromosome; | |
1468 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
1469 $chromosome = $mapped_chromosome; | |
1470 } | |
1471 else{ | |
1472 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
1473 } | |
1474 ### Now extracting the number of mismatches to the converted genome (the mismatch field is a comma-separated list, one entry is counted per mismatch) | |
1475 my $number_of_mismatches; | |
1476 if ($mismatch_info eq ''){ | |
1477 $number_of_mismatches = 0; | |
1478 } | |
1479 elsif ($mismatch_info =~ /^\d/){ | |
1480 my @mismatches = split (/,/,$mismatch_info); | |
1481 $number_of_mismatches = scalar @mismatches; | |
1482 } | |
1483 else{ | |
1484 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; | |
1485 } | |
1486 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
1487 my $alignment_location = join (":",$chromosome,$position); | |
1488 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
1489 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same | |
1490 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index | |
1491 ### number for the found alignment) | |
1492 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ | |
1493 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; | |
1494 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; | |
1495 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; | |
1496 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; | |
1497 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; | |
1498 } | |
1499 $number_of_mismatches = undef; | |
1500 ################################################################################################################################################## | |
1501 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or | |
1502 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will | |
1503 ### be returned as $valid_alignment_found and it will then be processed in the next round only. | |
1504 ################################################################################################################################################## | |
1505 my $newline = $fhs[$index]->{fh}-> getline(); | |
1506 if ($newline){ | |
1507 my ($seq_id) = split (/\t/,$newline); | |
1508 $fhs[$index]->{last_seq_id} = $seq_id; | |
1509 $fhs[$index]->{last_line} = $newline; | |
1510 } | |
1511 else { | |
1512 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) | |
1513 $fhs[$index]->{last_seq_id} = undef; | |
1514 $fhs[$index]->{last_line} = undef; | |
1515 next; | |
1516 } | |
1517 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); | |
1518 ### we only continue to extract useful information about this second alignment if 1 was returned | |
1519 if ($valid_alignment_found_2 == 1){ | |
1520 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself | |
1521 ### the chromosome name is again obtained by stripping the _CT_converted or _GA_converted suffix from the reference name reported by bowtie | |
1522 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; | |
1523 unless($mismatch_info){ | |
1524 $mismatch_info = ''; | |
1525 } | |
1526 chomp $mismatch_info; | |
1527 | |
1528 my $chromosome; | |
1529 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
1530 $chromosome = $mapped_chromosome; | |
1531 } | |
1532 else{ | |
1533 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
1534 } | |
1535 | |
1536 ### Now extracting the number of mismatches to the converted genome | |
1537 my $number_of_mismatches; | |
1538 if ($mismatch_info eq ''){ | |
1539 $number_of_mismatches = 0; | |
1540 } | |
1541 elsif ($mismatch_info =~ /^\d/){ | |
1542 my @mismatches = split (/,/,$mismatch_info); | |
1543 $number_of_mismatches = scalar @mismatches; | |
1544 } | |
1545 else{ | |
     ### NOTE(review): unlike the equivalent check in STEP I this message does not include the offending field | |
1546 die "Something weird is going on with the mismatch field\n"; | |
1547 } | |
1548 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
1549 ### extracting the chromosome number from the bowtie output (see above) | |
1550 my $alignment_location = join (":",$chromosome,$position); | |
1551 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position | |
1552 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this | |
1553 ### case we are not writing the same entry out a second time. | |
1554 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ | |
1555 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; | |
1556 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; | |
1557 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; | |
1558 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; | |
1559 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; | |
1560 } | |
1561 #################################################################################################################################### | |
1562 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### | |
1563 #################################################################################################################################### | |
1564 $newline = $fhs[$index]->{fh}-> getline(); | |
1565 if ($newline){ | |
1566 my ($seq_id) = split (/\t/,$newline); | |
1567 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); | |
1568 $fhs[$index]->{last_seq_id} = $seq_id; | |
1569 $fhs[$index]->{last_line} = $newline; | |
1570 next; | |
1571 } | |
1572 else { | |
1573 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) | |
1574 $fhs[$index]->{last_seq_id} = undef; | |
1575 $fhs[$index]->{last_line} = undef; | |
1576 next; | |
1577 } | |
1578 ### still within the 2nd sequence in correct orientation found | |
1579 } | |
1580 ### still within the 1st sequence in correct orientation found | |
1581 } | |
1582 ### still within the if (last_seq_id eq identifier) condition | |
1583 } | |
1584 ### still within foreach index loop | |
1585 } | |
1586 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file | |
1587 unless(%mismatches){ | |
1588 $counting{no_single_alignment_found}++; | |
1589 if ($unmapped){ | |
1590 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified | |
1591 } | |
1592 else{ | |
1593 return; | |
1594 } | |
1595 } | |
1596 ####################################################################################################################################################### | |
1597 ####################################################################################################################################################### | |
1598 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### | |
1599 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### | |
1600 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### | |
1601 ####################################################################################################################################################### | |
1602 ####################################################################################################################################################### | |
1603 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
1604 my $sequence_fails = 0; | |
1605 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
1606 my $methylation_call_params; # hash reference! | |
1607 ### sorting in ascending order; only the first (lowest) mismatch number is ever looked at because of the 'last' at the end of the loop body | |
1608 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ | |
1609 | |
1610 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment | |
1611 if (scalar keys %{$mismatches{$mismatch_number}} == 1){ | |
1612 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ | |
1613 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; | |
1614 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; | |
1615 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; | |
1616 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; | |
1617 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; | |
1618 } | |
1619 } | |
1620 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ | |
1621 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and | |
1622 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a | |
1623 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome | |
1624 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite | |
1625 ### reaction. E.g. | |
1626 ### CAGTCACGCGCGCGCG will become | |
1627 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. | |
1628 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave | |
1629 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: | |
1630 ### G->A conversion: | |
1631 ### highly methylated: CAATCACACACACACA | |
1632 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce | |
1633 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the | |
1634 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted | |
1635 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. | |
1636 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions | |
1637 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of | |
1638 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. | |
1639 ### In the above example the number of transliterations required to transform the actual sequence | |
1640 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) | |
1641 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) | |
1642 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments | |
1643 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed | |
1644 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be | |
1645 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded. | |
1646 my @three_candidate_seqs; | |
1647 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ | |
1648 my $transliterations_performed; | |
     ### alignments from indices 0 and 1 are scored against the 'CT' conversion of the read, those from indices 2 and 3 against the 'GA' conversion | |
1649 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ | |
1650 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); | |
1651 } | |
1652 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ | |
1653 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); | |
1654 } | |
1655 else{ | |
     ### NOTE(review): no system call has failed at this point, so $! presumably carries no useful information in this message | |
1656 die "unexpected index number range $!\n"; | |
1657 } | |
1658 push @three_candidate_seqs,{ | |
1659 index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, | |
1660 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, | |
1661 mismatch_number => $mismatch_number, | |
1662 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, | |
1663 position => $mismatches{$mismatch_number}->{$composite_location}->{position}, | |
1664 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, | |
1665 transliterations_performed => $transliterations_performed, | |
1666 }; | |
1667 } | |
1668 ### sorting in ascending order for the lowest number of transliterations performed | |
1669 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; | |
1670 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; | |
1671 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; | |
1672 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; | |
1673 # print "$first_array_element\t$second_array_element\t$third_array_element\n"; | |
     ### the best candidate is only accepted if twice its transliteration count is still below that of the runner-up | |
1674 if (($first_array_element*2) < $second_array_element){ | |
1675 $counting{low_complexity_alignments_overruled_count}++; | |
1676 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits | |
1677 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; | |
1678 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; | |
1679 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; | |
1680 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; | |
1681 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; | |
1682 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; | |
1683 } | |
1684 else{ | |
1685 $sequence_fails = 1; | |
1686 } | |
1687 } | |
1688 else{ | |
     ### any other number of equally good alignments (2, or 4 and more) is treated as ambiguous | |
1689 $sequence_fails = 1; | |
1690 } | |
1691 ### after processing the alignment with the lowest number of mismatches we exit | |
1692 last; | |
1693 } | |
1694 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions | |
     ### $ambiguous takes precedence over $unmapped when both are set | |
1695 if ($sequence_fails == 1){ | |
1696 $counting{unsuitable_sequence_count}++; | |
1697 if ($ambiguous){ | |
1698 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified | |
1699 } | |
1700 if ($unmapped){ | |
1701 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified | |
1702 } | |
1703 else{ | |
1704 return 0; # => exits to next sequence (default) | |
1705 } | |
1706 } | |
1707 | |
1708 ### --DIRECTIONAL | |
1709 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
1710 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
     ### rejected reads return 0, i.e. they are counted but not reported as unmapped or ambiguous | |
1711 if ($directional){ | |
1712 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ | |
1713 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
1714 $counting{alignments_rejected_count}++; | |
1715 return 0; | |
1716 } | |
1717 } | |
1718 | |
1719 ### If the sequence has not been rejected so far it will have a unique best alignment | |
1720 $counting{unique_best_alignment_count}++; | |
     ### the extraction subs are expected to add ->{unmodified_genomic_sequence} and ->{read_conversion} to $methylation_call_params (both are read below) | |
1721 if ($pbat){ | |
1722 extract_corresponding_genomic_sequence_single_end_pbat($identifier,$methylation_call_params); | |
1723 } | |
1724 else{ | |
1725 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); | |
1726 } | |
1727 | |
1728 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call | |
1729 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ | |
1730 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; | |
1731 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
1732 return 0; | |
1733 } | |
1734 | |
1735 ### otherwise we are set to perform the actual methylation call | |
1736 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); | |
1737 | |
1738 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); | |
1739 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out | |
1740 } | |
1741 | |
1742 sub check_bowtie_results_single_end_bowtie2{ | |
1743 my ($sequence,$identifier,$quality_value) = @_; | |
1744 | |
1745 | |
1746 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout | |
1747 $quality_value = 'I'x(length$sequence); | |
1748 } | |
1749 | |
1750 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. | |
1751 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs | |
1752 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; | |
1753 | |
1754 my $alignment_ambiguous = 0; | |
1755 | |
1756 my %alignments = (); | |
1757 | |
1758 ### reading from the Bowtie 2 output filehandles | |
1759 foreach my $index (0..$#fhs){ | |
1760 # print "Index: $index\n"; | |
1761 # print "$fhs[$index]->{last_line}\n"; | |
1762 # print "$fhs[$index]->{last_seq_id}\n"; | |
1763 # sleep (1); | |
1764 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
1765 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); | |
1766 | |
1767 ### if the sequence we are currently looking at produced an alignment we are doing various things with it | |
1768 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; | |
1769 | |
1770 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
1771 # SAM format specifications for Bowtie 2 | |
1772 # (1) Name of read that aligned | |
1773 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
1774 # 1 The read is one of a pair | |
1775 # 2 The alignment is one end of a proper paired-end alignment | |
1776 # 4 The read has no reported alignments | |
1777 # 8 The read is one of a pair and has no reported alignments | |
1778 # 16 The alignment is to the reverse reference strand | |
1779 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
1780 # 64 The read is mate 1 in a pair | |
1781 # 128 The read is mate 2 in a pair | |
1782 # 256 The read has multiple mapping states | |
1783 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
1784 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
1785 # (5) Mapping quality (255 means MAPQ is not available) | |
1786 # (6) CIGAR string representation of alignment (* if unavailable) | |
1787 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
1788 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
1789 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
1790 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
1791 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
1792 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
1793 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
1794 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
1795 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
1796 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
1797 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
1798 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
1799 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
1800 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
1801 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
1802 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
1803 | |
1804 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; | |
1805 | |
1806 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance | |
1807 if ($flag == 4){ | |
1808 ## reading in the next alignment, which must be the next sequence | |
1809 my $newline = $fhs[$index]->{fh}-> getline(); | |
1810 if ($newline){ | |
1811 chomp $newline; | |
1812 my ($seq_id) = split (/\t/,$newline); | |
1813 $fhs[$index]->{last_seq_id} = $seq_id; | |
1814 $fhs[$index]->{last_line} = $newline; | |
1815 if ($seq_id eq $identifier){ | |
1816 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
1817 } | |
1818 next; # next instance | |
1819 } | |
1820 else{ | |
1821 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
1822 $fhs[$index]->{last_seq_id} = undef; | |
1823 $fhs[$index]->{last_line} = undef; | |
1824 next; | |
1825 } | |
1826 } | |
1827 | |
1828 # if there are one or more proper alignments we can extract the chromosome number | |
1829 my $chromosome; | |
1830 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
1831 $chromosome = $mapped_chromosome; | |
1832 } | |
1833 else{ | |
1834 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
1835 } | |
1836 | |
1837 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
1838 my ($alignment_score,$second_best,$MD_tag); | |
1839 my @fields = split (/\t/,$fhs[$index]->{last_line}); | |
1840 | |
1841 foreach (11..$#fields){ | |
1842 if ($fields[$_] =~ /AS:i:(.*)/){ | |
1843 $alignment_score = $1; | |
1844 } | |
1845 elsif ($fields[$_] =~ /XS:i:(.*)/){ | |
1846 $second_best = $1; | |
1847 } | |
1848 elsif ($fields[$_] =~ /MD:Z:(.*)/){ | |
1849 $MD_tag = $1; | |
1850 } | |
1851 } | |
1852 | |
1853 # warn "First best alignment_score is: '$alignment_score'\n"; | |
1854 # warn "MD tag is: '$MD_tag'\n"; | |
1855 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag); | |
1856 | |
1857 if (defined $second_best){ | |
1858 # warn "second best alignment_score is: '$second_best'\n\n"; | |
1859 | |
1860 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether | |
1861 if ($alignment_score == $second_best){ | |
1862 $alignment_ambiguous = 1; | |
1863 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
1864 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
1865 my $newline = $fhs[$index]->{fh}-> getline(); | |
1866 if ($newline){ | |
1867 chomp $newline; | |
1868 my ($seq_id) = split (/\t/,$newline); | |
1869 $fhs[$index]->{last_seq_id} = $seq_id; | |
1870 $fhs[$index]->{last_line} = $newline; | |
1871 } | |
1872 else{ | |
1873 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
1874 $fhs[$index]->{last_seq_id} = undef; | |
1875 $fhs[$index]->{last_line} = undef; | |
1876 last; # break free in case we have reached the end of the alignment output | |
1877 } | |
1878 } | |
1879 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
1880 } | |
1881 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
1882 | |
1883 my $alignment_location = join (":",$chromosome,$position); | |
1884 | |
1885 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
1886 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
1887 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
1888 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
1889 | |
1890 unless (exists $alignments{$alignment_location}){ | |
1891 $alignments{$alignment_location}->{seq_id} = $id; | |
1892 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
1893 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
1894 $alignments{$alignment_location}->{index} = $index; | |
1895 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
1896 $alignments{$alignment_location}->{position} = $position; | |
1897 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
1898 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
1899 } | |
1900 | |
1901 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence | |
1902 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
1903 my $newline = $fhs[$index]->{fh}-> getline(); | |
1904 if ($newline){ | |
1905 chomp $newline; | |
1906 my ($seq_id) = split (/\t/,$newline); | |
1907 $fhs[$index]->{last_seq_id} = $seq_id; | |
1908 $fhs[$index]->{last_line} = $newline; | |
1909 } | |
1910 else{ | |
1911 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
1912 $fhs[$index]->{last_seq_id} = undef; | |
1913 $fhs[$index]->{last_line} = undef; | |
1914 last; # break free in case we have reached the end of the alignment output | |
1915 } | |
1916 } | |
1917 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
1918 } | |
1919 } | |
1920 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
1921 | |
1922 my $alignment_location = join (":",$chromosome,$position); | |
1923 | |
1924 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
1925 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
1926 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
1927 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
1928 | |
1929 unless (exists $alignments{$alignment_location}){ | |
1930 $alignments{$alignment_location}->{seq_id} = $id; | |
1931 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
1932 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
1933 $alignments{$alignment_location}->{index} = $index; | |
1934 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
1935 $alignments{$alignment_location}->{position} = $position; | |
1936 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
1937 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
1938 } | |
1939 | |
1940 my $newline = $fhs[$index]->{fh}-> getline(); | |
1941 if ($newline){ | |
1942 chomp $newline; | |
1943 my ($seq_id) = split (/\t/,$newline); | |
1944 $fhs[$index]->{last_seq_id} = $seq_id; | |
1945 $fhs[$index]->{last_line} = $newline; | |
1946 if ($seq_id eq $identifier){ | |
1947 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
1948 } | |
1949 } | |
1950 else{ | |
1951 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
1952 $fhs[$index]->{last_seq_id} = undef; | |
1953 $fhs[$index]->{last_line} = undef; | |
1954 } | |
1955 } | |
1956 } | |
1957 } | |
1958 | |
1959 ### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. | |
1960 if ($alignment_ambiguous == 1){ | |
1961 $counting{unsuitable_sequence_count}++; | |
1962 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
1963 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
1964 # print "$ambiguous_read_output\n"; | |
1965 | |
1966 if ($ambiguous){ | |
1967 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified | |
1968 } | |
1969 elsif ($unmapped){ | |
1970 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
1971 } | |
1972 else{ | |
1973 return 0; | |
1974 } | |
1975 } | |
1976 | |
1977 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file | |
1978 unless(%alignments){ | |
1979 $counting{no_single_alignment_found}++; | |
1980 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); | |
1981 # print "$unmapped_read_output\n"; | |
1982 if ($unmapped){ | |
1983 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified | |
1984 } | |
1985 else{ | |
1986 return 0; # default | |
1987 } | |
1988 } | |
1989 | |
1990 ####################################################################################################################################################### | |
1991 | |
1992 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
1993 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
1994 ### alignment score we are discarding the sequence altogether. | |
1995 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for | |
1996 ### opening (5) and extending (3 per bp) the gap. | |
1997 | |
1998 ####################################################################################################################################################### | |
1999 | |
2000 my $methylation_call_params; # hash reference which will store all information we need for the methylation call | |
2001 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
2002 | |
2003 ### print contents of %alignments for debugging | |
2004 # if (scalar keys %alignments > 1){ | |
2005 # print "\n******\n"; | |
2006 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
2007 # print "Loc: $alignment_location\n"; | |
2008 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
2009 # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; | |
2010 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; | |
2011 # print "Index $alignments{$alignment_location}->{index}\n"; | |
2012 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
2013 # print "pos: $alignments{$alignment_location}->{position}\n"; | |
2014 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; | |
2015 # } | |
2016 # print "\n******\n"; | |
2017 # } | |
2018 | |
2019 ### if there is only 1 entry in the hash with we accept it as the best alignment | |
2020 if (scalar keys %alignments == 1){ | |
2021 for my $unique_best_alignment (keys %alignments){ | |
2022 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; | |
2023 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
2024 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; | |
2025 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
2026 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; | |
2027 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; | |
2028 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; | |
2029 } | |
2030 } | |
2031 | |
2032 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
2033 ### we boot the sequence altogether | |
2034 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
2035 my $best_alignment_score; | |
2036 my $best_alignment_location; | |
2037 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ | |
2038 # print "$alignments{$alignment_location}->{alignment_score}\n"; | |
2039 unless (defined $best_alignment_score){ | |
2040 $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; | |
2041 $best_alignment_location = $alignment_location; | |
2042 # print "setting best alignment score: $best_alignment_score\n"; | |
2043 } | |
2044 else{ | |
2045 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted | |
2046 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ | |
2047 # warn "Same alignment score, the sequence will get booted!\n"; | |
2048 $sequence_fails = 1; | |
2049 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments | |
2050 } | |
2051 ### else we are going to store the best alignment for further processing | |
2052 else{ | |
2053 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; | |
2054 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
2055 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; | |
2056 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
2057 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; | |
2058 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; | |
2059 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; | |
2060 last; # exiting after processing the second alignment since the sequence produced a unique best alignment | |
2061 } | |
2062 } | |
2063 } | |
2064 } | |
2065 else{ | |
2066 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; | |
2067 } | |
2068 | |
2069 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions | |
2070 if ($sequence_fails == 1){ | |
2071 $counting{unsuitable_sequence_count}++; | |
2072 | |
2073 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
2074 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
2075 # print OUT "$ambiguous_read_output\n"; | |
2076 | |
2077 if ($ambiguous){ | |
2078 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified | |
2079 } | |
2080 elsif ($unmapped){ | |
2081 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
2082 } | |
2083 else{ | |
2084 return 0; # => exits to next sequence (default) | |
2085 } | |
2086 } | |
2087 | |
2088 ### --DIRECTIONAL | |
2089 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
2090 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
2091 if ($directional){ | |
2092 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ | |
2093 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
2094 $counting{alignments_rejected_count}++; | |
2095 return 0; | |
2096 } | |
2097 } | |
2098 | |
2099 ### If the sequence has not been rejected so far it has a unique best alignment | |
2100 $counting{unique_best_alignment_count}++; | |
2101 | |
2102 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well | |
2103 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); | |
2104 | |
2105 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call | |
2106 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ | |
2107 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; | |
2108 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
2109 return 0; | |
2110 } | |
2111 | |
2112 | |
2113 ### otherwise we are set to perform the actual methylation call | |
2114 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); | |
2115 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); | |
2116 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out | |
2117 } | |
2118 | |
2119 | |
### Returns the number of characters in $sequence that a transliteration in the given
### read conversion mode would replace:
###   'CT' => number of Cs in the sequence (C -> T conversion)
###   'GA' => number of Gs in the sequence (G -> A conversion)
### The sequence handed in by the caller is never modified.
### Dies if $read_conversion is undefined or is neither 'CT' nor 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  if (defined $read_conversion){
    # tr/// with an empty replacement list leaves the string untouched and simply returns
    # the number of matching characters, which is the same number tr/C/T/ (or tr/G/A/) reports
    return ($sequence =~ tr/C//) if ($read_conversion eq 'CT');
    return ($sequence =~ tr/G//) if ($read_conversion eq 'GA');
  }

  # $! is deliberately not part of the message: no system call failed here, so it would only
  # contain an unrelated, stale OS error. Reporting the offending value is more useful instead.
  my $reported_mode = defined $read_conversion ? "'$read_conversion'" : 'undef';
  die "Read conversion mode of the read was not specified or is invalid (got $reported_mode, expected 'CT' or 'GA')\n";
}
2134 | |
### Decides whether the Bowtie 1 alignment currently buffered in $fhs[$index]->{last_line}
### belongs to the read $identifier and lies in an acceptable orientation.
###
### Returns 1 if a usable alignment for $identifier is stored in $fhs[$index] (either the one
### that was buffered already, or the directly following line of the same read).
### Returns 0 if the buffered line belongs to a different read, if all alignments of this read
### were in the wrong orientation, or if the end of the alignment output was reached.
###
### Whenever further lines are read from $fhs[$index]->{fh}, last_seq_id and last_line are
### updated to the most recently read line (or set to undef at the end of the output).
### Dies if the orientation check returns anything other than 0 or 1, or if the same read ID
### shows up a third time in a row.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # small helper storing the sequence ID and the raw line that will be looked at next
  my $remember = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
  };

  # Bowtie 1 format: the read ID is the first and the strand the second tab-separated field
  my ($read_id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the buffered entry already belongs to the next sequence -> it is analysed in a later round
  return 0 unless (($read_id eq $fhs[$index]->{last_seq_id}) and ($read_id eq $identifier));

  ### 1st possibility for a sequence to pass: the buffered alignment has a sensible orientation
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the buffered alignment was in the wrong orientation, so the next line has to be read in
  my $next_line = $fhs[$index]->{fh}->getline();
  unless ($next_line){
    # end of the bowtie output
    $remember->(undef,undef);
    return 0;
  }

  ($read_id,$strand) = (split (/\t/,$next_line))[0,1];

  ### the line just read is already the next sequence to be analysed -> store it for the next round
  unless ($read_id eq $identifier){
    $remember->($read_id,$next_line);
    return 0;
  }

  ### 2nd possibility for a sequence to pass: the second alignment of this read is oriented correctly
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $remember->($read_id,$next_line);
    return 1;
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### both alignments were in the wrong orientation; the following line has to belong to a new read
  $next_line = $fhs[$index]->{fh}->getline();
  unless ($next_line){
    # end of the bowtie output
    $remember->(undef,undef);
    return 0;
  }

  my ($following_id) = split (/\t/,$next_line);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($following_id eq $identifier);

  # nothing gets processed this round; the new entry is dealt with in the next one
  $remember->($following_id,$next_line);
  return 0;
}
2212 ######################### | |
2213 ### BOWTIE 1 | PAIRED-END | |
2214 ######################### | |
2215 | |
2216 sub check_bowtie_results_paired_ends{ | |
2217 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; | |
2218 | |
2219 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 | |
2220 unless ($quality_value_1){ | |
2221 $quality_value_1 = 'I'x(length$sequence_1); | |
2222 } | |
2223 unless ($quality_value_2){ | |
2224 $quality_value_2 = 'I'x(length$sequence_2); | |
2225 } | |
2226 | |
2227 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; | |
2228 # sleep (1); | |
2229 my %mismatches = (); | |
2230 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome | |
2231 | |
2232 | |
2233 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. | |
2234 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). | |
2235 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) | |
2236 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary | |
2237 ### strands are not being reported by specifying --directional | |
2238 | |
2239 foreach my $index (0,3,1,2){ | |
2240 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
2241 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); | |
2242 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it | |
2243 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
2244 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; | |
2245 | |
2246 ################################################################################## | |
2247 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### | |
2248 ################################################################################## | |
2249 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); | |
2250 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong | |
2251 ### orientation. We only continue to extract useful information about this alignment if 1 was returned | |
2252 if ($valid_alignment_found == 1){ | |
2253 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. | |
2254 ### we store the useful information in %mismatches | |
2255 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; | |
2256 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; | |
2257 chomp $mismatch_info_1; | |
2258 chomp $mismatch_info_2; | |
2259 | |
2260 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted | |
2261 my ($chromosome_1,$chromosome_2); | |
2262 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
2263 $chromosome_1 = $mapped_chromosome_1; | |
2264 } | |
2265 else{ | |
2266 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
2267 } | |
2268 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
2269 $chromosome_2 = $mapped_chromosome_2; | |
2270 } | |
2271 else{ | |
2272 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
2273 } | |
2274 | |
2275 ### Now extracting the number of mismatches to the converted genome | |
2276 my $number_of_mismatches_1; | |
2277 my $number_of_mismatches_2; | |
2278 if ($mismatch_info_1 eq ''){ | |
2279 $number_of_mismatches_1 = 0; | |
2280 } | |
2281 elsif ($mismatch_info_1 =~ /^\d/){ | |
2282 my @mismatches = split (/,/,$mismatch_info_1); | |
2283 $number_of_mismatches_1 = scalar @mismatches; | |
2284 } | |
2285 else{ | |
2286 die "Something weird is going on with the mismatch field\n"; | |
2287 } | |
2288 if ($mismatch_info_2 eq ''){ | |
2289 $number_of_mismatches_2 = 0; | |
2290 } | |
2291 elsif ($mismatch_info_2 =~ /^\d/){ | |
2292 my @mismatches = split (/,/,$mismatch_info_2); | |
2293 $number_of_mismatches_2 = scalar @mismatches; | |
2294 } | |
2295 else{ | |
2296 die "Something weird is going on with the mismatch field\n"; | |
2297 } | |
2298 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments | |
2299 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; | |
2300 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
2301 die "Position 1 is higher than position 2" if ($position_1 > $position_2); | |
2302 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
2303 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
2304 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
2305 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same | |
2306 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index | |
2307 ### number for the found alignment) | |
2308 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ | |
2309 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine | |
2310 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; | |
2311 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; | |
2312 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; | |
2313 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine | |
2314 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; | |
2315 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; | |
2316 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; | |
2317 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; | |
2318 } | |
2319 ################################################################################################################################################### | |
2320 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### | |
2321 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line _2. ### | |
2322 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### | |
2323 ### this round ### | |
2324 ################################################################################################################################################### | |
2325 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2326 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2327 | |
2328 if ($newline_1 and $newline_2){ | |
2329 my ($seq_id_1) = split (/\t/,$newline_1); | |
2330 my ($seq_id_2) = split (/\t/,$newline_2); | |
2331 | |
2332 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag | |
2333 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2334 } | |
2335 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag | |
2336 $fhs[$index]->{last_seq_id} = $seq_id_2; | |
2337 } | |
2338 else{ | |
2339 die "Either read 1 or read 2 needs to end on '/1'\n"; | |
2340 } | |
2341 | |
2342 $fhs[$index]->{last_line_1} = $newline_1; | |
2343 $fhs[$index]->{last_line_2} = $newline_2; | |
2344 } | |
2345 else { | |
2346 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) | |
2347 $fhs[$index]->{last_seq_id} = undef; | |
2348 $fhs[$index]->{last_line_1} = undef; | |
2349 $fhs[$index]->{last_line_2} = undef; | |
2350 next; # jumping to the next index | |
2351 } | |
2352 ### Now processing the entry we just stored in last_line_1 and last_line_2 | |
2353 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); | |
2354 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to | |
2355 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation | |
2356 if ($valid_alignment_found == 1){ | |
2357 ### we store the useful information in %mismatches | |
2358 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; | |
2359 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; | |
2360 chomp $mismatch_info_1; | |
2361 chomp $mismatch_info_2; | |
2362 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) | |
2363 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
2364 $chromosome_1 = $mapped_chromosome_1; | |
2365 } | |
2366 else{ | |
2367 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
2368 } | |
2369 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
2370 $chromosome_2 = $mapped_chromosome_2; | |
2371 } | |
2372 else{ | |
2373 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
2374 } | |
2375 | |
2376 $number_of_mismatches_1=''; | |
2377 $number_of_mismatches_2=''; | |
2378 ### Now extracting the number of mismatches to the converted genome | |
2379 if ($mismatch_info_1 eq ''){ | |
2380 $number_of_mismatches_1 = 0; | |
2381 } | |
2382 elsif ($mismatch_info_1 =~ /^\d/){ | |
2383 my @mismatches = split (/,/,$mismatch_info_1); | |
2384 $number_of_mismatches_1 = scalar @mismatches; | |
2385 } | |
2386 else{ | |
2387 die "Something weird is going on with the mismatch field\n"; | |
2388 } | |
2389 if ($mismatch_info_2 eq ''){ | |
2390 $number_of_mismatches_2 = 0; | |
2391 } | |
2392 elsif ($mismatch_info_2 =~ /^\d/){ | |
2393 my @mismatches = split (/,/,$mismatch_info_2); | |
2394 $number_of_mismatches_2 = scalar @mismatches; | |
2395 } | |
2396 else{ | |
2397 die "Something weird is going on with the mismatch field\n"; | |
2398 } | |
2399 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments | |
2400 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; | |
2401 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
2402 die "position 1 is greater than position 2" if ($position_1 > $position_2); | |
2403 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
2404 $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
2405 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse | |
2406 ### strand) were methylated and therefore protected. There is no need to overwrite the same positional entry with a second entry for the same | |
2407 ### location (the genomic sequence extraction and methylation call would not be affected by this; the only thing which would change is the index | |
2408 ### number for the found alignment) | |
2409 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ | |
2410 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine | |
2411 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; | |
2412 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; | |
2413 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; | |
2414 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine | |
2415 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; | |
2416 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; | |
2417 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; | |
2418 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; | |
2419 } | |
2420 ############################################################################################################################################### | |
2421 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ### | |
2422 ############################################################################################################################################### | |
2423 $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2424 $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2425 | |
2426 if ($newline_1 and $newline_2){ | |
2427 my ($seq_id_1) = split (/\t/,$newline_1); | |
2428 my ($seq_id_2) = split (/\t/,$newline_2); | |
2429 | |
2430 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag | |
2431 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2432 } | |
2433 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag | |
2434 $fhs[$index]->{last_seq_id} = $seq_id_2; | |
2435 } | |
2436 $fhs[$index]->{last_line_1} = $newline_1; | |
2437 $fhs[$index]->{last_line_2} = $newline_2; | |
2438 } | |
2439 else { | |
2440 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) | |
2441 $fhs[$index]->{last_seq_id} = undef; | |
2442 $fhs[$index]->{last_line_1} = undef; | |
2443 $fhs[$index]->{last_line_2} = undef; | |
2444 next; # jumping to the next index | |
2445 } | |
2446 ### within the 2nd sequence pair alignment in correct orientation found | |
2447 } | |
2448 ### within the 1st sequence pair alignment in correct orientation found | |
2449 } | |
2450 ### still within the (last_seq_id eq identifier) condition | |
2451 } | |
2452 ### still within foreach index loop | |
2453 } | |
2454 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file | |
2455 unless(%mismatches){ | |
2456 $counting{no_single_alignment_found}++; | |
2457 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified | |
2458 } | |
2459 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
2460 my $sequence_pair_fails = 0; | |
2461 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
2462 my $methylation_call_params; # hash reference! | |
2463 ### We are now checking whether there is a unique best alignment for a certain sequence pair. This means we sort in ascending order and look at the | |
2464 ### alignment(s) with the lowest number of mismatches. If there is only one single best position we store the alignment information in | |
2465 ### $methylation_call_params; if there are multiple hits with the same (lowest) number of mismatches we discard the sequence pair altogether | |
2466 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ | |
2467 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; | |
2468 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ | |
2469 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; | |
2470 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; | |
2471 } | |
2472 if (scalar keys %{$mismatches{$mismatch_number}} == 1){ | |
2473 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; | |
2474 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ | |
2475 $methylation_call_params->{$identifier}->{seq_id} = $identifier; | |
2476 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; | |
2477 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; | |
2478 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; | |
2479 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; | |
2480 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; | |
2481 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); | |
2482 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; | |
2483 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; | |
2484 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; | |
2485 } | |
2486 } | |
2487 else{ | |
2488 $sequence_pair_fails = 1; | |
2489 } | |
2490 ### after processing the alignment with the lowest number of mismatches we exit | |
2491 last; | |
2492 } | |
2493 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions | |
2494 if ($sequence_pair_fails == 1){ | |
2495 $counting{unsuitable_sequence_count}++; | |
2496 if ($ambiguous){ | |
2497 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified | |
2498 } | |
2499 if ($unmapped){ | |
2500 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified | |
2501 } | |
2502 else{ | |
2503 return 0; # => exits to next sequence (default) | |
2504 } | |
2505 } | |
2506 | |
2507 ### --DIRECTIONAL | |
2508 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
2509 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
2510 if ($directional){ | |
2511 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ | |
2512 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
2513 $counting{alignments_rejected_count}++; | |
2514 return 0; | |
2515 } | |
2516 } | |
2517 | |
2518 ### If the sequence has not been rejected so far it does have a unique best alignment | |
2519 $counting{unique_best_alignment_count}++; | |
2520 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); | |
2521 | |
2522 ### sanity check: only if the genomic sequences we extracted have the same length as the observed sequences +2 do we perform the methylation call | |
2523 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | |
2524 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; | |
2525 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
2526 return 0; | |
2527 } | |
2528 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | |
2529 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; | |
2530 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
2531 return 0; | |
2532 } | |
2533 | |
2534 ### otherwise we are set to perform the actual methylation call | |
2535 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); | |
2536 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); | |
2537 | |
2538 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); | |
2539 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 | |
2540 } | |
2541 | |
2542 ######################### | |
2543 ### BOWTIE 2 | PAIRED-END | |
2544 ######################### | |
2545 | |
2546 sub check_bowtie_results_paired_ends_bowtie2{ | |
2547 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; | |
2548 | |
2549 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 | |
2550 unless ($quality_value_1){ | |
2551 $quality_value_1 = 'I'x(length$sequence_1); | |
2552 } | |
2553 | |
2554 unless ($quality_value_2){ | |
2555 $quality_value_2 = 'I'x(length$sequence_2); | |
2556 } | |
2557 | |
2558 | |
2559 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; | |
2560 | |
2561 | |
2562 my %alignments; | |
2563 my $alignment_ambiguous = 0; | |
2564 | |
2565 ### reading from the Bowtie 2 output filehandles | |
2566 | |
2567 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way. | |
2568 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). | |
2569 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) | |
2570 ### before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary | |
2571 ### strands are not being reported when '--directional' is specified | |
2572 | |
2573 foreach my $index (0,3,1,2){ | |
2574 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
2575 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); | |
2576 | |
2577 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it | |
2578 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
2579 | |
2580 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; | |
2581 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; | |
2582 # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; | |
2583 # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; | |
2584 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; | |
2585 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; | |
2586 $id_1 =~ s/\/1$//; | |
2587 $id_2 =~ s/\/2$//; | |
2588 | |
2589 # SAM format specifications for Bowtie 2 | |
2590 # (1) Name of read that aligned | |
2591 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
2592 # 1 The read is one of a pair | |
2593 # 2 The alignment is one end of a proper paired-end alignment | |
2594 # 4 The read has no reported alignments | |
2595 # 8 The read is one of a pair and has no reported alignments | |
2596 # 16 The alignment is to the reverse reference strand | |
2597 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
2598 # 64 The read is mate 1 in a pair | |
2599 # 128 The read is mate 2 in a pair | |
2600 # 256 The read has multiple mapping states | |
2601 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
2602 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
2603 # (5) Mapping quality (255 means MAPQ is not available) | |
2604 # (6) CIGAR string representation of alignment (* if unavailable) | |
2605 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
2606 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
2607 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
2608 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
2609 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
2610 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
2611 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
2612 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
2613 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
2614 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
2615 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
2616 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2617 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2618 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
2619 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
2620 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
2621 | |
2622 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). | |
2623 ### We can store the next alignment and move on to the next Bowtie 2 instance | |
2624 if ($flag_1 == 77 and $flag_2 == 141){ | |
2625 ## reading in the next alignment, which must be the next sequence | |
2626 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2627 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2628 | |
2629 if ($newline_1 and $newline_2){ | |
2630 chomp $newline_1; | |
2631 chomp $newline_2; | |
2632 my ($seq_id_1) = split (/\t/,$newline_1); | |
2633 my ($seq_id_2) = split (/\t/,$newline_2); | |
2634 $seq_id_1 =~ s/\/1$//; | |
2635 $seq_id_2 =~ s/\/2$//; | |
2636 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2637 $fhs[$index]->{last_line_1} = $newline_1; | |
2638 $fhs[$index]->{last_line_2} = $newline_2; | |
2639 | |
2640 # print "current sequence ($identifier) did not map, reading in next sequence\n"; | |
2641 # print "$index\t$fhs[$index]->{last_seq_id}\n"; | |
2642 # print "$index\t$fhs[$index]->{last_line_1}\n"; | |
2643 # print "$index\t$fhs[$index]->{last_line_2}\n"; | |
2644 next; # next instance | |
2645 } | |
2646 else{ | |
2647 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2648 $fhs[$index]->{last_seq_id} = undef; | |
2649 $fhs[$index]->{last_line_1} = undef; | |
2650 $fhs[$index]->{last_line_2} = undef; | |
2651 next; | |
2652 } | |
2653 } | |
2654 | |
2655 ### If there are one or more proper alignments we can extract the chromosome number | |
2656 my ($chromosome_1,$chromosome_2); | |
2657 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
2658 $chromosome_1 = $mapped_chromosome_1; | |
2659 } | |
2660 else{ | |
2661 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
2662 } | |
2663 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
2664 $chromosome_2 = $mapped_chromosome_2; | |
2665 } | |
2666 else{ | |
2667 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
2668 } | |
2669 | |
2670 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
2671 | |
2672 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
2673 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); | |
2674 | |
2675 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); | |
2676 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); | |
2677 | |
2678 foreach (11..$#fields_1){ | |
2679 if ($fields_1[$_] =~ /AS:i:(.*)/){ | |
2680 $alignment_score_1 = $1; | |
2681 } | |
2682 elsif ($fields_1[$_] =~ /XS:i:(.*)/){ | |
2683 $second_best_1 = $1; | |
2684 } | |
2685 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ | |
2686 $MD_tag_1 = $1; | |
2687 } | |
2688 } | |
2689 | |
2690 foreach (11..$#fields_2){ | |
2691 if ($fields_2[$_] =~ /AS:i:(.*)/){ | |
2692 $alignment_score_2 = $1; | |
2693 } | |
2694 elsif ($fields_2[$_] =~ /XS:i:(.*)/){ | |
2695 $second_best_2 = $1; | |
2696 } | |
2697 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ | |
2698 $MD_tag_2 = $1; | |
2699 } | |
2700 } | |
2701 | |
2702 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); | |
2703 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); | |
2704 | |
2705 # warn "First read 1 alignment score is: '$alignment_score_1'\n"; | |
2706 # warn "First read 2 alignment score is: '$alignment_score_2'\n"; | |
2707 # warn "MD tag 1 is: '$MD_tag_1'\n"; | |
2708 # warn "MD tag 2 is: '$MD_tag_2'\n"; | |
2709 | |
2710 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments | |
2711 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; | |
2712 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; | |
2713 | |
2714 if (defined $second_best_1 and defined $second_best_2){ | |
2715 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; | |
2716 # warn "Second best alignment_score_1 is: '$second_best_1'\n"; | |
2717 # warn "Second best alignment_score_2 is: '$second_best_2'\n"; | |
2718 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; | |
2719 | |
2720 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether | |
2721 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ | |
2722 $alignment_ambiguous = 1; | |
2723 # print "This read will be chucked (AS==XS detected)!\n"; | |
2724 | |
2725 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
2726 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
2727 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2728 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2729 if ($newline_1 and $newline_2){ | |
2730 chomp $newline_1; | |
2731 chomp $newline_2; | |
2732 my ($seq_id_1) = split (/\t/,$newline_1); | |
2733 my ($seq_id_2) = split (/\t/,$newline_2); | |
2734 $seq_id_1 =~ s/\/1$//; | |
2735 $seq_id_2 =~ s/\/2$//; | |
2736 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
2737 | |
2738 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2739 $fhs[$index]->{last_line_1} = $newline_1; | |
2740 $fhs[$index]->{last_line_2} = $newline_2; | |
2741 } | |
2742 else{ | |
2743 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2744 $fhs[$index]->{last_seq_id} = undef; | |
2745 $fhs[$index]->{last_line_1} = undef; | |
2746 $fhs[$index]->{last_line_2} = undef; | |
2747 last; # break free if the end of the alignment output was reached | |
2748 } | |
2749 } | |
2750 # if ($fhs[$index]->{last_seq_id}){ | |
2751 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
2752 # } | |
2753 } | |
2754 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
2755 | |
2756 my $alignment_location; | |
2757 if ($position_1 <= $position_2){ | |
2758 $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
2759 } | |
2760 elsif($position_2 < $position_1){ | |
2761 $alignment_location = join(":",$chromosome_1,$position_2,$position_1); | |
2762 } | |
2763 | |
2764 ### If a sequence aligns to exactly the same location twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse | |
2765 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, there is no need to overwrite | |
2766 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
2767 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
2768 | |
2769 unless (exists $alignments{$alignment_location}){ | |
2770 $alignments{$alignment_location}->{seq_id} = $id_1; | |
2771 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
2772 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
2773 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
2774 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
2775 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
2776 $alignments{$alignment_location}->{index} = $index; | |
2777 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
2778 $alignments{$alignment_location}->{position_1} = $position_1; | |
2779 $alignments{$alignment_location}->{position_2} = $position_2; | |
2780 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
2781 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
2782 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
2783 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
2784 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
2785 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
2786 } | |
2787 # warn "added best of several alignments to \%alignments hash\n"; | |
2788 | |
2789 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence | |
2790 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
2791 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2792 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2793 if ($newline_1 and $newline_2){ | |
2794 chomp $newline_1; | |
2795 chomp $newline_2; | |
2796 my ($seq_id_1) = split (/\t/,$newline_1); | |
2797 my ($seq_id_2) = split (/\t/,$newline_2); | |
2798 $seq_id_1 =~ s/\/1$//; | |
2799 $seq_id_2 =~ s/\/2$//; | |
2800 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
2801 | |
2802 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2803 $fhs[$index]->{last_line_1} = $newline_1; | |
2804 $fhs[$index]->{last_line_2} = $newline_2; | |
2805 } | |
2806 else{ | |
2807 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
2808 $fhs[$index]->{last_seq_id} = undef; | |
2809 $fhs[$index]->{last_line_1} = undef; | |
2810 $fhs[$index]->{last_line_2} = undef; | |
2811 last; # break free if the end of the alignment output was reached | |
2812 } | |
2813 } | |
2814 # if($fhs[$index]->{last_seq_id}){ | |
2815 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; | |
2816 # } | |
2817 } | |
2818 } | |
2819 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
2820 | |
2821 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
2822 # print "$alignment_location\n"; | |
2823 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse | |
2824 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, there is no need to overwrite | |
2825 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
2826 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
2827 | |
2828 unless (exists $alignments{$alignment_location}){ | |
2829 $alignments{$alignment_location}->{seq_id} = $id_1; | |
2830 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
2831 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
2832 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
2833 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
2834 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
2835 $alignments{$alignment_location}->{index} = $index; | |
2836 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
2837 $alignments{$alignment_location}->{position_1} = $position_1; | |
2838 $alignments{$alignment_location}->{position_2} = $position_2; | |
2839 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
2840 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
2841 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
2842 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
2843 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
2844 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
2845 } | |
2846 | |
2847 # warn "added unique alignment to \%alignments hash\n"; | |
2848 | |
2849 # Now reading and storing the next read pair | |
2850 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
2851 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
2852 if ($newline_1 and $newline_2){ | |
2853 chomp $newline_1; | |
2854 chomp $newline_2; | |
2855 # print "$newline_1\n"; | |
2856 # print "$newline_2\n"; | |
2857 my ($seq_id_1) = split (/\t/,$newline_1); | |
2858 my ($seq_id_2) = split (/\t/,$newline_2); | |
2859 $seq_id_1 =~ s/\/1$//; | |
2860 $seq_id_2 =~ s/\/2$//; | |
2861 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
2862 | |
2863 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
2864 $fhs[$index]->{last_line_1} = $newline_1; | |
2865 $fhs[$index]->{last_line_2} = $newline_2; | |
2866 | |
2867 if ($seq_id_1 eq $identifier){ | |
2868 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
2869 } | |
2870 } | |
2871 else{ | |
2872 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
2873 $fhs[$index]->{last_seq_id} = undef; | |
2874 $fhs[$index]->{last_line_1} = undef; | |
2875 $fhs[$index]->{last_line_2} = undef; | |
2876 } | |
2877 } | |
2878 } | |
2879 } | |
2880 | |
2881 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format | |
2882 if ($alignment_ambiguous == 1){ | |
2883 $counting{unsuitable_sequence_count}++; | |
2884 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
2885 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
2886 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
2887 # print "$ambiguous_read_1\n"; | |
2888 # print "$ambiguous_read_2\n"; | |
2889 | |
2890 if ($ambiguous){ | |
2891 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
2892 } | |
2893 elsif ($unmapped){ | |
2894 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
2895 } | |
2896 else{ | |
2897 return 0; | |
2898 } | |
2899 } | |
2900 | |
2901 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file | |
2902 unless (%alignments){ | |
2903 $counting{no_single_alignment_found}++; | |
2904 | |
2905 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
2906 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
2907 # print "$unmapped_read_1\n"; | |
2908 # print "$unmapped_read_2\n"; | |
2909 if ($unmapped){ | |
2910 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified | |
2911 } | |
2912 else{ | |
2913 return 0; | |
2914 } | |
2915 } | |
2916 | |
2917 ####################################################################################################################################################### | |
2918 | |
2919 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
2920 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
2921 ### alignment score we are discarding the sequence pair altogether. | |
2922 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) | |
2923 ### and extending (3 per bp) the gap. | |
2924 | |
2925 ####################################################################################################################################################### | |
2926 | |
2927 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
2928 my $methylation_call_params; # hash reference | |
2929 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
2930 | |
2931 ### print contents of %alignments for debugging | |
2932 ## if (scalar keys %alignments >= 1){ | |
2933 # print "\n******\n"; | |
2934 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
2935 # print "Loc: $alignment_location\n"; | |
2936 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
2937 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; | |
2938 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; | |
2939 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; | |
2940 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; | |
2941 # print "Index $alignments{$alignment_location}->{index}\n"; | |
2942 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
2943 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; | |
2944 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; | |
2945 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; | |
2946 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; | |
2947 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; | |
2948 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; | |
2949 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; | |
2950 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; | |
2951 # } | |
2952 # print "\n******\n"; | |
2953 # } | |
2954 | |
2955 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment | |
2956 if (scalar keys %alignments == 1){ | |
2957 for my $unique_best_alignment (keys %alignments){ | |
2958 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; | |
2959 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; | |
2960 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
2961 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; | |
2962 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; | |
2963 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
2964 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; | |
2965 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; | |
2966 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; | |
2967 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; | |
2968 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; | |
2969 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; | |
2970 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; | |
2971 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; | |
2972 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; | |
2973 } | |
2974 } | |
2975 | |
2976 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
2977 ### we boot the sequence pair altogether) | |
2978 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
2979 my $best_sum_of_alignment_scores; | |
2980 my $best_alignment_location; | |
2981 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ | |
2982 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; | |
2983 unless (defined $best_sum_of_alignment_scores){ | |
2984 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; | |
2985 $best_alignment_location = $alignment_location; | |
2986 # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; | |
2987 } | |
2988 else{ | |
2989 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted | |
2990 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ | |
2991 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; | |
2992 $sequence_pair_fails = 1; | |
2993 last; # exiting since we know that the sequence has ambiguous alignments | |
2994 } | |
2995 ### else we are going to store the best alignment for further processing | |
2996 else{ | |
2997 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; | |
2998 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; | |
2999 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
3000 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; | |
3001 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; | |
3002 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
3003 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; | |
3004 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; | |
3005 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; | |
3006 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; | |
3007 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; | |
3008 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; | |
3009 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; | |
3010 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; | |
3011 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; | |
3012 last; # exiting since the sequence produced a unique best alignment | |
3013 } | |
3014 } | |
3015 } | |
3016 } | |
3017 else{ | |
3018 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; | |
3019 } | |
3020 | |
3021 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions | |
3022 if ($sequence_pair_fails == 1){ | |
3023 $counting{unsuitable_sequence_count}++; | |
3024 | |
3025 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
3026 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
3027 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
3028 # print "$ambiguous_read_1\n"; | |
3029 # print "$ambiguous_read_2\n"; | |
3030 | |
3031 if ($ambiguous){ | |
3032 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
3033 } | |
3034 elsif ($unmapped){ | |
3035 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
3036 } | |
3037 else{ | |
3038 return 0; # => exits to next sequence pair (default) | |
3039 } | |
3040 } | |
3041 | |
3042 ### --DIRECTIONAL | |
3043 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
3044 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
3045 if ($directional){ | |
3046 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ | |
3047 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
3048 $counting{alignments_rejected_count}++; | |
3049 return 0; | |
3050 } | |
3051 } | |
3052 | |
3053 ### If the sequence pair has not been rejected so far it does have a unique best alignment | |
3054 $counting{unique_best_alignment_count}++; | |
3055 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); | |
3056 | |
3057 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call | |
3058 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | |
3059 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; | |
3060 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3061 return 0; | |
3062 } | |
3063 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | |
3064 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; | |
3065 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3066 return 0; | |
3067 } | |
3068 | |
3069 ### now we are set to perform the actual methylation call | |
3070 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); | |
3071 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); | |
3072 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; | |
3073 # print " $sequence_2\n"; | |
3074 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; | |
3075 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; | |
3076 | |
3077 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); | |
3078 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 | |
3079 } | |
3080 | |
3081 ### | |
3082 | |
### Decides whether the read pair currently buffered for Bowtie filehandle $index can be used as an alignment of $identifier.
###
### Arguments: $index      - position of the Bowtie instance in @fhs
###            $identifier - sequence ID (without the /1 read tag) of the read pair being analysed
###
### Returns 1 if $fhs[$index] holds (possibly after reading one further pair of lines) an alignment of $identifier for which
### ensure_sensical_alignment_orientation_paired_ends() returned 1, and 0 otherwise. Whenever new lines were read from the
### filehandle, last_seq_id, last_line_1 and last_line_2 of $fhs[$index] are updated (set to undef at the end of the output).
### Dies if the orientation check returns something other than 1 or 0, if neither read ID of a freshly read pair ends on /1,
### or if the same sequence ID shows up a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### small helper: overwrites the buffered entry of this filehandle with the given seq-ID and pair of lines
  my $buffer_entry = sub {
    my ($buffered_id,$buffered_line_1,$buffered_line_2) = @_;
    $fhs[$index]->{last_seq_id} = $buffered_id;
    $fhs[$index]->{last_line_1} = $buffered_line_1;
    $fhs[$index]->{last_line_2} = $buffered_line_2;
  };

  ### only the ID (column 0) and the strand (column 1) are used for the decision; the mismatch column (7) is merely chomped
  my @columns_1 = split (/\t/,$fhs[$index]->{last_line_1},-1);
  my @columns_2 = split (/\t/,$fhs[$index]->{last_line_2},-1);
  my ($id_1,$strand_1,$mismatch_info_1) = @columns_1[0,1,7];
  my ($id_2,$strand_2,$mismatch_info_2) = @columns_2[0,1,7];
  chomp $mismatch_info_1;
  chomp $mismatch_info_2;

  ### either of the two lines may carry read 1, so the /1 tag is stripped from both IDs
  (my $seq_id_1 = $id_1) =~ s/\/1$//;
  (my $seq_id_2 = $id_2) =~ s/\/1$//;

  ### the buffered pair already belongs to the next sequence pair -> it will be analysed in a later round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### the buffered pair is the correct sequence; of the 8 possible orientations only 4 are theoretically sensible
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1);   ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation -> two new lines have to be read in
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of the Bowtie output: nothing left to buffer, and the stored alignment was in the wrong orientation
    $buffer_entry->(undef,undef,undef);
    return 0;
  }

  ### extracting ID and strand again, this time from the lines just read
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;

  ### the sequence ID is taken from whichever line is read 1 (i.e. ends on /1)
  my $seqid;
  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the pair just read is already the next sequence pair -> buffer it and process it only in the next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $buffer_entry->($seqid,$newline_1,$newline_2);
    return 0;
  }

  ### still the same sequence: checking the orientation of this second alignment
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### the second alignment becomes the buffered entry
    $buffer_entry->($seqid,$newline_1,$newline_2);
    return 1;   ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again -> yet another 2 lines are read, which must belong to the next entry
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of the Bowtie output
    $buffer_entry->(undef,undef,undef);
    return 0;
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);

  $seqid = '';
  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### a third entry with the same seq-ID must not happen; otherwise the new entry replaces the buffered one
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $buffer_entry->($seqid,$newline_1,$newline_2);
  return 0;   # nothing to process this round, both alignments of $identifier were in the wrong orientation
}
3203 | |
3204 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END | |
3205 | |
### Extracts, for a Bowtie 1 paired-end alignment, the genomic sequence underlying both reads (read length + 2 extra bases)
### from %chromosomes and records the strand/conversion 'memory' needed later for the methylation call.
###
### Arguments: $sequence_identifier     - ID of the read pair
###            $methylation_call_params - hash reference; the entry for $sequence_identifier is read for index, chromosome,
###                                       start_seq_1/2 and bowtie_sequence_1/2
###
### Stores alignment_read_1/2, genome_conversion, read_conversion_1/2 and unmodified_genomic_sequence_1/2 in that same entry
### and increments the %counting entry matching the index. A genomic sequence is stored as '' where the boundary check of the
### respective branch fails. Dies if the index is not 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### all details of this read pair live under its identifier (the entry is created if it does not exist yet)
  my $params = ($methylation_call_params->{$sequence_identifier} ||= {});

  my $index      = $params->{index};
  my $chromosome = $params->{chromosome};
  my $start_1    = $params->{start_seq_1};
  my $start_2    = $params->{start_seq_2};
  my $read_1     = $params->{bowtie_sequence_1};
  my $read_2     = $params->{bowtie_sequence_2};

  ### helper returning a stretch of the chromosome the pair aligned to; the chromosome is addressed through the hash
  ### every time so that its sequence never has to be copied
  my $genomic_stretch = sub {
    my ($offset,$stretch_length) = @_;
    return substr ($chromosomes{$chromosome},$offset,$stretch_length);
  };

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands.
  ### The alignment strands and the expected conversions are remembered for the methylation call
  my ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion);

  ### genomic sequences, each carrying 2 extra bases at one of the ends so that a CpG, CHG or CHH call is also possible
  ### if the C happens to be at the first or last position of the actually observed sequence
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### Bowtie always reports the + alignment first and the - alignment second, irrespective of whether read 1 or read 2 was
  ### the + alignment. Sequences are however always handled read 1 then read 2, so whenever read 2 is the + alignment
  ### (index 2 and 3) the extracted genomic sequences are swapped around!

  ### Index 0: CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation),
  ### sequence originated from the (converted) forward strand
  if ($index == 0){
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('+','-','CT','GA','CT');

    ### read 1 is the forward hit: 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = $genomic_stretch->($start_1,length($read_1)+2);

    ### read 2 is on the reverse strand: the extra bases are taken 3', they end up 5' after reverse complementation
    if (length($chromosomes{$chromosome}) > $start_2+length($read_2)+1){
      $non_bisulfite_sequence_2 = $genomic_stretch->($start_2,length($read_2)+2);
      $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### Index 1: GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation),
  ### sequence originated from the complementary to (converted) reverse strand
  elsif ($index == 1){
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('+','-','GA','CT','GA');

    ### read 1 is the forward hit; the call is made for the base 5' of the first base (GA conversion!), so the 2 extra
    ### bases are taken at the 5' end, which is only possible if the read does not start right at the chromosome start
    $non_bisulfite_sequence_1 = ($start_1-1 > 0) ? $genomic_stretch->($start_1-2,length($read_1)+2) : '';

    ### read 2 is on the reverse strand: 2 extra bases at the 5' end, which are 3' after reverse complementation
    $non_bisulfite_sequence_2 = $genomic_stretch->($start_2-2,length($read_2)+2);
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### Index 2: GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation),
  ### sequence originated from the complementary to (converted) forward strand
  elsif ($index == 2){
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('-','+','GA','CT','CT');

    ### Sequence information is switched round here: the genomic sequence for read 1 is taken from the second
    ### alignment (start_seq_2), 2 extra 3' bases which become 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = $genomic_stretch->($start_2,length($read_2)+2);
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);

    ### the genomic sequence for read 2 (CT converted) is taken from the first alignment (start_seq_1), 2 extra 3' bases
    if (length($chromosomes{$chromosome}) > $start_1+length($read_1)+1){
      $non_bisulfite_sequence_2 = $genomic_stretch->($start_1,length($read_1)+2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### Index 3: CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation),
  ### sequence originated from the (converted) reverse strand
  elsif ($index == 3){
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion) = ('-','+','CT','GA','GA');

    ### Sequence information is switched round here as well: the genomic sequence for read 1 is taken from the second
    ### alignment (start_seq_2), 2 extra 5' bases which become 3' bases after reverse complementation
    if ( ($start_2-1) > 0){
      $non_bisulfite_sequence_1 = $genomic_stretch->($start_2-2,length($read_2)+2);
      $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### the genomic sequence for read 2 (GA converted) is taken from the first alignment (start_seq_1), 2 extra 5' bases
    $non_bisulfite_sequence_2 = $genomic_stretch->($start_1-2,length($read_1)+2);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment strand tells which strand of the genomic sequence the read is compared against,
  ### the read conversion tells whether to look for C->T or G->A substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
3346 | |
3347 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END | |
3348 | |
### Extracts the unconverted genomic sequence underneath both reads of a Bowtie 2 paired-end alignment and stores it,
### together with the strand and conversion 'memory' needed for the methylation call, in
### $methylation_call_params->{$sequence_identifier}.
###
### Values read from the per-read hash:  CIGAR_1, CIGAR_2, position_1, position_2, chromosome, index
### Values written on success:           alignment_read_1/2, read_conversion_1/2, genome_conversion,
###                                      unmodified_genomic_sequence_1/2, end_position_1/2, indels_1/2
###
### Each genomic sequence carries 2 extra bases at one end (5' for index 1 and 3, 3' for index 0 and 2) so that a C at
### the last (or first) position of the read can still be assigned to CpG, CHG or CHH context.
###
### If the 2 extra bases would fall off the edge of the chromosome the sub returns early. In that case only
### unmodified_genomic_sequence_1 and _2 are written (with whatever could be extracted up to that point); both are
### always defined so that the caller can test their lengths without triggering 'uninitialized' warnings.
### (presumably the caller discards such read pairs based on the sequence length - verify against the caller)
###
### Dies if a CIGAR string has a non-matching number of lengths and operations, if it contains operations other than
### M, I and D, or if the index is not one of 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### The first lookup deliberately uses the full path so that a missing entry gets autovivified inside the callers
  ### data structure; from then on we work with a short-hand reference to the hash of this read pair
  my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
  my $read_pair = $methylation_call_params->{$sequence_identifier};
  my $cigar_2 = $read_pair->{CIGAR_2};

  my $chromosome = $read_pair->{chromosome};
  my $index = $read_pair->{index};

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos_1 = $read_pair->{position_1}-1;
  my $pos_2 = $read_pair->{position_2}-1;

  ### Both CIGAR strings are split into lengths and operations before any sequence is extracted
  my ($len_1,$ops_1) = _parse_cigar_bowtie2($cigar_1,1);
  my ($len_2,$ops_2) = _parse_cigar_bowtie2($cigar_2,2);

  ### the 2 additional bp are taken from the 5' end for indexes 1 and 3, and from the 3' end for indexes 0 and 2
  my $extend_5_prime = ($index == 1 or $index == 3);
  my $extend_3_prime = ($index == 0 or $index == 2);

  ### Extracting read 1 genomic sequence ###
  my ($non_bisulfite_sequence_1,$indels_1,$complete_1);
  ($non_bisulfite_sequence_1,$pos_1,$indels_1,$complete_1) = _extract_genomic_sequence_bowtie2($chromosome,$pos_1,$len_1,$ops_1,$cigar_1,1,$extend_5_prime,$extend_3_prime);

  unless ($complete_1){ # right at the edge of a chromosome; read 2 is set to an empty sequence so that it is defined
    $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    $read_pair->{unmodified_genomic_sequence_2} = '';
    return;
  }

  ### Extracting read 2 genomic sequence ###
  my ($non_bisulfite_sequence_2,$indels_2,$complete_2);
  ($non_bisulfite_sequence_2,$pos_2,$indels_2,$complete_2) = _extract_genomic_sequence_bowtie2($chromosome,$pos_2,$len_2,$ops_2,$cigar_2,2,$extend_5_prime,$extend_3_prime);

  unless ($complete_2){ # right at the edge of a chromosome; read 1 needs to be set as well to prevent warnings
    $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
    $read_pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
    return;
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my $alignment_read_1;
  my $alignment_read_2;
  my $read_conversion_info_1;
  my $read_conversion_info_2;
  my $genome_conversion;

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'CT';
    ### Read 1 is the forward hit, Read 2 is on the reverse strand and needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) bottom strand]
    $counting{GA_CT_GA_count}++;
    $alignment_read_1 = '+';
    $alignment_read_2 = '-';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'GA';
    ### Read 1 is the forward hit, Read 2 is on the reverse strand and needs to be reverse complemented
    $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) top strand]
    $counting{GA_CT_CT_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'GA';
    $read_conversion_info_2 = 'CT';
    $genome_conversion = 'CT';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    $alignment_read_1 = '-';
    $alignment_read_2 = '+';
    $read_conversion_info_1 = 'CT';
    $read_conversion_info_2 = 'GA';
    $genome_conversion = 'GA';
    ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  $read_pair->{alignment_read_1} = $alignment_read_1;
  $read_pair->{alignment_read_2} = $alignment_read_2;
  $read_pair->{genome_conversion} = $genome_conversion;
  $read_pair->{read_conversion_1} = $read_conversion_info_1;
  $read_pair->{read_conversion_2} = $read_conversion_info_2;
  $read_pair->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $read_pair->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
  ## the end position of a read is the position reached after walking through all CIGAR operations
  $read_pair->{end_position_1} = $pos_1;
  $read_pair->{end_position_2} = $pos_2;
  ## number of inserted and deleted bases, to be added to the hamming distance for the NM field of the SAM output
  $read_pair->{indels_1} = $indels_1;
  $read_pair->{indels_2} = $indels_2;
}

### Private helper: splits a CIGAR string into its operation lengths and operation characters.
### Returns two array references (lengths, operations); dies if their numbers of elements differ.
### $read_number (1 or 2) is only used for the error message.
sub _parse_cigar_bowtie2{
  my ($cigar,$read_number) = @_;

  my @lengths = split (/\D+/,$cigar);    # storing the length per operation
  my @operations = split (/\d+/,$cigar); # storing the operation
  shift @operations;                     # remove the empty first element
  die "CIGAR $read_number string contained a non-matching number of lengths and operations\n" unless (scalar @lengths == scalar @operations);

  return (\@lengths,\@operations);
}

### Private helper: walks through the CIGAR operations of one read and assembles the corresponding genomic sequence
### from %chromosomes, starting at the 0-based position $pos.
###   M: genomic bases are appended and the position advances
###   I: padding Ns are appended (these are not used to infer methylation calls), the position stays the same
###   D: nothing is appended, only the position advances
### Depending on the two flags, 2 extra bases are added in front of (5') or behind (3') the aligned sequence.
### Returns a list: (genomic sequence, position after the last operation, number of inserted/deleted bases, complete)
### where 'complete' is 0 if the 2 extra bases could not be extracted because of a chromosome edge (the sequence
### assembled so far is returned in that case) and 1 otherwise.
### Dies on any CIGAR operation other than M, I or D; $cigar and $read_number are only used for the error messages.
sub _extract_genomic_sequence_bowtie2{
  my ($chromosome,$pos,$lengths,$operations,$cigar,$read_number,$extend_5_prime,$extend_3_prime) = @_;

  my $genomic_sequence = '';
  my $indels = 0;

  ### 2 additional bp at the 5' end
  if ($extend_5_prime){
    # offset 0 is a valid start for the substring, thus the test has to be >= and not >
    unless ( ($pos-2) >= 0){
      return ($genomic_sequence,$pos,$indels,0);
    }
    $genomic_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  foreach my $op_index (0..$#{$lengths}){
    my $operation = $operations->[$op_index];
    my $length = $lengths->[$op_index];

    if ($operation eq 'M'){
      $genomic_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($operation eq 'I'){ # insertion in the read sequence
      $genomic_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($operation eq 'D'){ # deletion in the read sequence
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/NSHPX=//){ # if these (for standard mapping) illegal characters exist we die
      die "The CIGAR $read_number string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR $read_number string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### 2 additional bp at the 3' end
  if ($extend_3_prime){
    unless (length($chromosomes{$chromosome}) >= $pos+2){
      return ($genomic_sequence,$pos,$indels,0);
    }
    $genomic_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  return ($genomic_sequence,$pos,$indels,1);
}
3578 | |
3579 ########################################## | |
3580 ### PRINT SINGLE END RESULTS: Bowtie 1 ### | |
3581 ########################################## | |
3582 | |
### Writes a uniquely mapped single-end read (Bowtie 1) and its methylation call to the output.
### Arguments: read identifier, read sequence, the methylation call parameter hash reference and the quality string.
### The stored start position of the read is incremented by 1 as a side effect.
### With $vanilla set a tab-delimited line is printed to OUT, otherwise the read is handed to single_end_SAM_output.
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always reported in Sanger encoding (Phred 33 scale)
  my $sanger_quality = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                     : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                     :            $quality_value;

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output, default since Bismark v1.0.0
  unless ($vanilla){
    return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$sanger_quality); # at the end of the script
  }

  ### custom Bismark output: one tab-separated line per uniquely mapped read
  my $read = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$read}{qw(alignment_strand chromosome position end_position)},
    $sequence,
    @{$read}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
    $sanger_quality,
  );
  my $bowtie1_output = join("\t",@columns);
  print OUT "$bowtie1_output\n";
}
3606 | |
3607 ########################################## | |
3608 ### PRINT SINGLE END RESULTS: Bowtie 2 ### | |
3609 ########################################## | |
3610 | |
### Hands a mapped single-end read (Bowtie 2) and its methylation call over to single_end_SAM_output.
### Arguments: read identifier, read sequence, the methylation call parameter hash reference and the quality string.
### Depending on $phred64 / $solexa the quality string is converted to Sanger encoding (Phred 33 scale) first.
sub print_bisulfite_mapping_result_single_end_bowtie2{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### the FastQ quality is always reported in Sanger encoding (Phred 33 scale)
  my $sanger_quality = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                     : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                     :            $quality_value;

  ### every mapped read and its methylation call goes to the SAM output file (unmapped and ambiguous reads were already printed)
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$sanger_quality); # at the end of the script
}
3625 | |
3626 ########################################## | |
3627 ### PRINT PAIRED END RESULTS: Bowtie 1 ### | |
3628 ########################################## | |
3629 | |
### Writes an aligned read pair (Bowtie 1) and its methylation calls to the output.
### Arguments: read identifier, sequence of read 1, sequence of read 2, the methylation call parameter hash reference,
### quality string of read 1 and quality string of read 2.
### The stored start position of read 1 (start_seq_1) is incremented by 1 as a side effect.
### With $vanilla set a tab-delimited line is printed to OUT, otherwise the pair is handed to paired_end_SAM_output.
sub print_bisulfite_mapping_results_paired_ends{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always reported in Sanger encoding (Phred 33 scale)
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp (End position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### SAM output, default since Bismark v1.0.0
  unless ($vanilla){
    return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
  }

  ### custom Bismark output: one tab-separated line per aligned read pair
  my $pair = $methylation_call_params->{$identifier};
  my @columns = (
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  my $bowtie1_output_paired_end = join("\t",@columns);
  print OUT "$bowtie1_output_paired_end\n";
}
3656 | |
3657 ########################################## | |
3658 ### PRINT PAIRED END RESULTS: Bowtie 2 ### | |
3659 ########################################## | |
3660 | |
### Hands an aligned read pair (Bowtie 2) and its methylation calls over to paired_end_SAM_output.
### Arguments: read identifier, sequence of read 1, sequence of read 2, the methylation call parameter hash reference,
### quality string of read 1 and quality string of read 2.
### Depending on $phred64 / $solexa both quality strings are converted to Sanger encoding (Phred 33 scale) first.
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### the FastQ qualities are always reported in Sanger encoding (Phred 33 scale)
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### every aligned read pair and its methylation call goes to the output file (unmapped and ambiguous reads were already printed)
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
3678 | |
3679 | |
### Converts a whole Phred64 encoded quality string into a Phred33 (Sanger) encoded quality string.
### Every character is first turned into its Phred score and then re-encoded with the Phred33 offset.
### Takes the quality string as its only argument and returns the converted string.
sub convert_phred64_quals_to_phred33{
  my ($phred64_quality) = @_;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_phred64_quality_string_into_phred_score($_) )
  } split (//,$phred64_quality);

  return join ("",@phred33_characters);
}
3695 | |
### Converts a whole Solexa (pre 1.3) encoded quality string into a Phred33 (Sanger) encoded quality string.
### Every character is first turned into its Phred score and then re-encoded with the Phred33 offset.
### Takes the quality string as its only argument and returns the converted string.
sub convert_solexa_quals_to_phred33{
  my ($solexa_quality) = @_;

  my @phred33_characters = map {
    convert_phred_score_into_phred33_quality_string( convert_solexa_pre1_3_quality_string_into_phred_score($_) )
  } split (//,$solexa_quality);

  return join ("",@phred33_characters);
}
3711 | |
### Encodes a single Phred score as a Phred33 quality character, i.e. the character with the code (score + 33).
sub convert_phred_score_into_phred33_quality_string{
  my ($phred_score) = @_;
  return chr($phred_score+33);
}
3717 | |
### Decodes a single Phred64 quality character into its Phred score, i.e. the character code minus 64.
sub convert_phred64_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-64;
}
3723 | |
### Decodes a single Solexa (pre 1.3) quality character into an (approximate) Phred score.
### We simply use 59 as the offset here as all Phred Scores between 10 and 40 look exactly the same, there is only a
### minute difference for values between 0 and 10
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  my ($quality_character) = @_;
  return ord($quality_character)-59;
}
3730 | |
3731 | |
### Extracts the unconverted genomic sequence underneath a single-end alignment and stores it, together with the
### strand and conversion 'memory' needed for the methylation call, in $methylation_call_params->{$sequence_identifier}.
###
### Values read from the per-read hash:  index, chromosome, position, bowtie_sequence
### Values written:                      alignment_strand, read_conversion, genome_conversion,
###                                      unmodified_genomic_sequence, end_position
###
### The genomic sequence is 2 bases longer than the read so that a C at the last position of the read can still be
### assigned to CpG, CHG or CHH context. If these 2 extra bases would fall off the edge of the chromosome the
### genomic sequence is set to an empty string instead.
### Dies if the index is not one of 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### The first lookup deliberately uses the full path so that a missing entry gets autovivified inside the callers
  ### data structure; from then on we work with a short-hand reference to the hash of this read
  my $index = $methylation_call_params->{$sequence_identifier}->{index};
  my $read = $methylation_call_params->{$sequence_identifier};

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($alignment_strand,$read_conversion_info,$genome_conversion);

  ### whether the 2 extra bases are taken upstream (before the start position) or downstream (behind the read) on the
  ### forward strand, and whether the extracted sequence has to be reverse complemented afterwards
  my ($extra_bases_upstream,$needs_reverse_complement);

  ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_CT_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','CT','CT');
    ### + 2 extra bases at the 3' end
    ($extra_bases_upstream,$needs_reverse_complement) = (0,0);
  }
  ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from (converted) reverse strand]
    $counting{CT_GA_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','CT','GA');
    ### 2 extra 5' bases on the forward strand which will become 2 extra 3' bases after reverse complementation
    ($extra_bases_upstream,$needs_reverse_complement) = (1,1);
  }
  ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from complementary to (converted) forward strand]
    $counting{GA_CT_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','GA','CT');
    ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation
    ($extra_bases_upstream,$needs_reverse_complement) = (0,1);
  }
  ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_GA_count}++;
    ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','GA','GA');
    ### +2 extra bases at the 5' end as we are nominally checking the converted reverse strand
    ($extra_bases_upstream,$needs_reverse_complement) = (1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  my $chromosome = $read->{chromosome};
  my $start = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
  my $can_be_extracted = $extra_bases_upstream
                       ? ($start-2 >= 0)
                       : (length($chromosomes{$chromosome}) > $start+$read_length+1);

  my $non_bisulfite_sequence = '';
  if ($can_be_extracted){
    my $extraction_start = $extra_bases_upstream ? $start-2 : $start;
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$extraction_start,$read_length+2);
    if ($needs_reverse_complement){
      $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
  }

  $read->{alignment_strand} = $alignment_strand;
  $read->{read_conversion} = $read_conversion_info;
  $read->{genome_conversion} = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $start+$read_length;
}
3835 | |
3836 sub extract_corresponding_genomic_sequence_single_end_pbat { | |
3837 my ($sequence_identifier,$methylation_call_params) = @_; | |
3838 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the | |
3839 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call | |
3840 | |
3841 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, | |
3842 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions | |
3843 my $alignment_strand; | |
3844 my $read_conversion_info; | |
3845 my $genome_conversion; | |
3846 ### Also extracting the corresponding genomic sequence, +2 extra bases at the end so that we can also make a CpG methylation call and | |
3847 ### in addition make differential calls for Cs non-CpG context, which will now be divided into CHG and CHH methylation, | |
3848 ### if the C happens to be at the last position of the actually observed sequence | |
3849 my $non_bisulfite_sequence; | |
3850 ### depending on the conversion we want to make need to capture 1 extra base at the 3' end | |
3851 | |
3852 my $pbat_index = $methylation_call_params->{$sequence_identifier}->{index} + 2; # (we are simply not running indexes 0 or 1! | |
3853 | |
3854 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) | |
3855 if ($pbat_index == 0){ | |
3856 ### [Index 0, sequence originated from (converted) forward strand] | |
3857 $counting{CT_CT_count}++; | |
3858 $alignment_strand = '+'; | |
3859 $read_conversion_info = 'CT'; | |
3860 $genome_conversion = 'CT'; | |
3861 | |
3862 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
3863 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## CHH changed to +1 | |
3864 ### + 2 extra base at the 3' end | |
3865 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 | |
3866 } | |
3867 else{ | |
3868 $non_bisulfite_sequence = ''; | |
3869 } | |
3870 } | |
3871 | |
3872 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) | |
3873 elsif ($pbat_index == 1){ | |
3874 ### [Index 1, sequence originated from (converted) reverse strand] | |
3875 $counting{CT_GA_count}++; | |
3876 $alignment_strand = '-'; | |
3877 $read_conversion_info = 'CT'; | |
3878 $genome_conversion = 'GA'; | |
3879 | |
3880 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
3881 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to -2 # 02 02 2012 Changed this to >= from > | |
3882 ### Extracting 2 extra 5' bases on forward strand which will become 2 extra 3' bases after reverse complementation | |
3883 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 | |
3884 ## reverse complement! | |
3885 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); | |
3886 } | |
3887 else{ | |
3888 $non_bisulfite_sequence = ''; | |
3889 } | |
3890 } | |
3891 | |
3892 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) | |
3893 elsif ($pbat_index == 2){ | |
3894 ### [Index 2, sequence originated from complementary to (converted) forward strand] | |
3895 $counting{GA_CT_count}++; | |
3896 $alignment_strand = '-'; | |
3897 $read_conversion_info = 'GA'; | |
3898 $genome_conversion = 'CT'; | |
3899 | |
3900 ### +2 extra bases on the forward strand 3', which will become 2 extra 5' bases after reverse complementation | |
3901 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
3902 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+1){ ## changed to +1 on 02 02 2012 | |
3903 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to +2 | |
3904 ## reverse complement! | |
3905 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); | |
3906 } | |
3907 else{ | |
3908 $non_bisulfite_sequence = ''; | |
3909 } | |
3910 } | |
3911 | |
3912 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) | |
3913 elsif ($pbat_index == 3){ | |
3914 ### [Index 3, sequence originated from complementary to (converted) reverse strand] | |
3915 $counting{GA_GA_count}++; | |
3916 $alignment_strand = '+'; | |
3917 $read_conversion_info = 'GA'; | |
3918 $genome_conversion = 'GA'; | |
3919 | |
3920 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
3921 if ($methylation_call_params->{$sequence_identifier}->{position}-2 >= 0){ ## CHH changed to +2 # 02 02 2012 Changed this to >= from > | |
3922 ### +2 extra base at the 5' end as we are nominally checking the converted reverse strand | |
3923 $non_bisulfite_sequence = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{position}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence})+2); ## CHH changed to -2/+2 | |
3924 } | |
3925 else{ | |
3926 $non_bisulfite_sequence = ''; | |
3927 } | |
3928 } | |
3929 else{ | |
3930 die "Too many bowtie result filehandles\n"; | |
3931 } | |
3932 | |
3933 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; | |
3934 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; | |
3935 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; | |
3936 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; | |
3937 | |
3938 ### at this point we can also determine the end position of a read | |
3939 $methylation_call_params->{$sequence_identifier}->{end_position} = $methylation_call_params->{$sequence_identifier}->{position}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence}); | |
3940 } | |
3941 | |
3942 | |
### Extracts the genomic sequence underlying a single-end Bowtie 2 alignment and stores it, together with the
### strand and conversion information needed for the methylation call, in the entry of the read in $methylation_call_params.
###
### Arguments:
###   $sequence_identifier     - key of the read in %$methylation_call_params
###   $methylation_call_params - hash ref; the entry of the read has to provide {index} (0-3), {position} (1-based, as in SAM),
###                              {CIGAR} and {chromosome}
###
### On success the entry receives {alignment_strand}, {read_conversion}, {genome_conversion}, {unmodified_genomic_sequence},
### {end_position} and {indels}, and the matching strand counter in %counting is incremented.
### If the 2 extra context bases would run off either end of the chromosome, {unmodified_genomic_sequence} is set to an
### empty string and the sub returns without storing anything else.
### Dies if the CIGAR string is malformed or contains operations other than 'M', 'I' and 'D'.
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### all values of this read live in one hash; the entry is created if it did not exist yet
  my $read = $methylation_call_params->{$sequence_identifier} ||= {};

  my $cigar      = $read->{CIGAR};
  my $index      = $read->{index};
  my $chromosome = $read->{chromosome};

  ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my $alignment_strand;
  my $read_conversion_info;
  my $genome_conversion;

  ### We are extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
  ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
  my $non_bisulfite_sequence = '';

  ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
  my $pos = $read->{position}-1;

  ### parsing the CIGAR string into parallel lists of lengths and operations
  my @len = split (/\D+/,$cigar); # storing the length per operation
  my @ops = split (/\d+/,$cigar); # storing the operation
  shift @ops;                     # remove the empty first element
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

  ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3)
  ### the 2 extra bases are taken from in front of the alignment start
  if ( ($index == 1) or ($index == 3) ){
    ## we can't extract the extra bases if we are right at the start of a chromosome; exiting with an empty genomic sequence in that case
    unless ( ($pos-2) >= 0){
      $read->{unmodified_genomic_sequence} = '';
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  my $indels = 0; # inserted + deleted bases, used for the edit distance in the SAM output

  foreach my $i (0..$#len){
    my $op     = $ops[$i];
    my $length = $len[$i];

    if ($op eq 'M'){
      # extracting genomic sequence and adjusting the position
      $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
      $pos += $length;
    }
    elsif ($op eq 'I'){ # insertion in the read sequence
      # we simply add padding Ns instead of genomic sequence; these will not be used to infer methylation calls.
      # The genomic position doesn't need adjusting
      $non_bisulfite_sequence .= 'N' x $length;
      $indels += $length;
    }
    elsif ($op eq 'D'){ # deletion in the read sequence
      # we do not add any genomic sequence but only adjust the position
      $pos += $length;
      $indels += $length;
    }
    elsif ($cigar =~ tr/NSHPX=//){ # tr/// takes a plain character list, so no [] here. These operations are illegal for standard mapping
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2)
  ### the 2 extra bases are taken from behind the alignment end
  if ( ($index == 0) or ($index == 2) ){
    ## we can't extract the extra bases if we are right at the end of a chromosome; exiting with an empty genomic sequence in that case
    ## (the partially assembled sequence is discarded on purpose so that both edge cases behave the same way)
    unless (length($chromosomes{$chromosome}) >= $pos+2){
      $read->{unmodified_genomic_sequence} = '';
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_CT_count}++;
    $alignment_strand = '+';
    $read_conversion_info = 'CT';
    $genome_conversion = 'CT';
  }

  ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from (converted) reverse strand]
    $counting{CT_GA_count}++;
    $alignment_strand = '-';
    $read_conversion_info = 'CT';
    $genome_conversion = 'GA';

    ### reverse complement!
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
  }

  ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from complementary to (converted) forward strand]
    $counting{GA_CT_count}++;
    $alignment_strand = '-';
    $read_conversion_info = 'GA';
    $genome_conversion = 'CT';

    ### reverse complement!
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
  }

  ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_GA_count}++;
    $alignment_strand = '+';
    $read_conversion_info = 'GA';
    $genome_conversion = 'GA';
  }
  else{
    die "Too many Bowtie 2 result filehandles\n";
  }

  $read->{alignment_strand} = $alignment_strand;
  $read->{read_conversion} = $read_conversion_info;
  $read->{genome_conversion} = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### after walking the CIGAR string $pos holds the end position of the read
  $read->{end_position} = $pos;
  $read->{indels} = $indels;
}
4076 | |
4077 ### METHYLATION CALL | |
4078 | |
### Compares a read with the genomic sequence it was aligned to and returns the methylation call string for the read.
###
### Arguments:
###   $identifier                 - read ID (not used for the call itself)
###   $sequence_actually_observed - the read sequence
###   $genomic_sequence           - the genomic sequence; expected to be 2 bases longer than the read (a warning is printed otherwise).
###                                 For 'CT' reads position i of the read is compared with position i of the genomic sequence and the
###                                 context is read from the 2 following bases; for 'GA' reads position i is compared with position i+2
###                                 and the context is read from the 2 preceding bases
###   $read_conversion            - 'CT' or 'GA'; dies for anything else
###
### Returns a string with one character per read base:
###
#################################################################
### . for bases not involving cytosines                       ###
### X for methylated C in CHG context (was protected)         ###
### x for not methylated C in CHG context (was converted)     ###
### H for methylated C in CHH context (was protected)         ###
### h for not methylated C in CHH context (was converted)     ###
### Z for methylated C in CpG context (was protected)         ###
### z for not methylated C in CpG context (was converted)     ###
### U for methylated C in unknown context (was protected)     ###
### u for not methylated C in unknown context (was converted) ###
#################################################################
###
### The number of calls of each type is added to the corresponding total_* entries of %counting.
sub methylation_call{
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar(@seq) == scalar(@genomic)-2);

  ### Both conversion types are handled by the same loop, they only differ in
  ###   $target_base    - genomic base that reports on a cytosine (C itself, or the G opposite a C on the other strand)
  ###   $converted_base - what the read shows if that cytosine was converted, i.e. not methylated
  ###   $context_base   - base that has to follow (CT) or precede (GA) the target to make it a CpG or CHG
  ###   $offset         - position of the first read base within the genomic sequence
  ###   $step           - direction in which the context is read (+1 downstream, -1 upstream)
  my ($target_base,$converted_base,$context_base,$offset,$step);
  if ($read_conversion eq 'CT'){
    ($target_base,$converted_base,$context_base,$offset,$step) = ('C','T','G',0,1);
  }
  elsif ($read_conversion eq 'GA'){
    ($target_base,$converted_base,$context_base,$offset,$step) = ('G','A','C',2,-1);
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  my @match = ();
  my %calls = map { $_ => 0 } qw(Z z X x H h U u); # number of calls made per call character

  for my $index (0..$#seq){
    my $genomic_pos  = $index + $offset;
    my $genomic_base = $genomic[$genomic_pos];
    $genomic_base = '' unless (defined $genomic_base); # only possible if the genomic sequence was too short

    ### positions which are not a cytosine (or the G opposite one) are of no interest for a methylation call
    if ($genomic_base ne $target_base){
      push @match,'.';
      next;
    }

    my $methylated;
    if ($seq[$index] eq $target_base){       # residue was not converted, i.e. protected by methylation
      $methylated = 1;
    }
    elsif ($seq[$index] eq $converted_base){ # residue was converted, i.e. not methylated
      $methylated = 0;
    }
    else{                                    # all other mismatches are not of interest for a methylation call
      push @match,'.';
      next;
    }

    ### determining the sequence context from the next two bases in reading direction
    my $first_context_base  = $genomic[$genomic_pos + $step];
    my $second_context_base = $genomic[$genomic_pos + 2*$step];
    $first_context_base  = '' unless (defined $first_context_base);
    $second_context_base = '' unless (defined $second_context_base);

    my $call;
    if ($first_context_base eq $context_base){
      $call = 'Z'; # CpG context
    }
    elsif ($first_context_base eq 'N'){
      $call = 'U'; # we cannot be sure about the sequence context (as it might have been a CG)
    }
    elsif ($second_context_base eq $context_base){
      $call = 'X'; # CHG context
    }
    elsif ($second_context_base eq 'N'){
      $call = 'U'; # we cannot be sure about the sequence context (as it might have been a CHH or CHG)
    }
    else{
      $call = 'H'; # CHH context
    }

    ### upper case = methylated, lower case = not methylated
    $call = lc $call unless ($methylated);

    ++$calls{$call};
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count} += $calls{H};
  $counting{total_meCHG_count} += $calls{X};
  $counting{total_meCpG_count} += $calls{Z};
  $counting{total_meC_unknown_count} += $calls{U};
  $counting{total_unmethylated_CHH_count} += $calls{h};
  $counting{total_unmethylated_CHG_count} += $calls{x};
  $counting{total_unmethylated_CpG_count} += $calls{z};
  $counting{total_unmethylated_C_unknown_count} += $calls{u};

  return $methylation_call;
}
4297 | |
### Reads all FastA files of the genome folder into the %chromosomes hash (chromosome name => upper-cased sequence).
###
### Argument:
###   $cwd - directory to change back to once the genome has been read
###
### Changes into $genome_folder and reads every *.fa file, or every *.fasta file if there are no *.fa files.
### Single- as well as multi-FastA files are accepted. The name and length of every chromosome are printed while reading.
### Dies if the folder or a file can't be accessed, if no FastA files are found, if a file doesn't start with a FastA header
### or if a chromosome name occurs more than once. A chromosome without any sequence only triggers a warning.
sub read_genome_into_memory{
  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = glob('*.fa');

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = glob('*.fasta');
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open so that unusual characters in a file name can't be mistaken for an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    $first_line = '' unless (defined $first_line); # empty file; rejected by extract_chromosome_name below
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    my $sequence = ''; # starting with an empty string so that a record without sequence is stored and reported as 0 bp

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }
    close $chr_in;

    ### storing the last (or only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
4369 | |
### Returns the chromosome name of a FastA header line, i.e. everything between the leading '>' and the first white space.
### (Bowtie seems to use the first string after the initial > as sequence name, so we are doing the same.)
### Dies if the line does not start with '>'.
sub extract_chromosome_name {
  my ($fasta_header) = @_;

  ### removing the leading '>'; a line without one is not a FastA header
  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ### only the first white space separated field is the name, the rest is description
  my @header_fields = split (/\s+/,$fasta_header);
  return $header_fields[0];
}
4381 | |
### Returns the reverse complement of a DNA sequence.
### Only upper case A, C, G and T are complemented; all other characters (e.g. N) are kept as they are.
sub reverse_complement{
  my ($forward_sequence) = @_;

  ### reversing first and complementing afterwards gives the same result as the other way round
  my $reverse_complemented = scalar reverse($forward_sequence);
  $reverse_complemented =~ tr/CATG/GTAC/;

  return $reverse_complemented;
}
4388 | |
### Writes bisulfite converted versions of a single-end FastA read file into $temp_dir.
###
### Argument:
###   $file - FastA file with one header and one sequence line per read; files ending in .gz are read through zcat
###
### A C -> T converted version is always written, a G -> A converted version only unless $directional is set.
### The output files are named <file name>_C_to_T.fa and <file name>_G_to_A.fa (with an additional .gz, and piped through
### gzip, if $gzip is set) and are prefixed with "$prefix." if $prefix is set. The first $skip reads are left out and
### reading stops after read number $upto if these options are set. Read IDs are passed through fix_IDs, and sequences
### are converted to upper case before the conversion. $seqID_contains_tabs is incremented for every header containing a tab.
###
### Returns the names (without $temp_dir) of the C -> T and the G -> A file; the latter is also returned if the file
### itself was not written.
### Dies if a file can't be opened or closed, or if a header line does not start with '>'.
sub biTransformFastAFiles {
  my $file = shift;

  ### only the file name without any leading path is used to name the converted files
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### Opening the input file. zcat is started without a shell, and plain files are opened with an explicit read mode,
  ### so that spaces or shell/open() special characters in a file name are taken literally
  my $in;
  if ($file =~ /\.gz$/){
    ### gzipped version of the infile
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension = $gzip ? '.fa.gz' : '.fa';
  my $C_to_T_infile = "${filename}_C_to_T$extension";
  my $G_to_A_infile = "${filename}_G_to_A$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### Opens one of the converted files for writing, piped through gzip if requested. gzip needs the shell for the
  ### redirection, so the path is single-quoted to keep the shell from interpreting anything in it
  my $open_converted_file = sub {
    my $path = shift;
    my $fh;
    if ($gzip){
      (my $quoted_path = $path) =~ s/'/'\\''/g;
      open ($fh,'|-',"gzip -c - > '$quoted_path'") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',$path) or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  my $ctot = $open_converted_file->("$temp_dir$C_to_T_infile");

  my $gtoa;
  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_converted_file->("$temp_dir$G_to_A_infile");
  }

  my $count = 0;

  while (1){
    my $header = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### the return value is ignored on purpose: zcat exits abnormally if we stopped reading early because of $upto
  close $in;

  close $ctot or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close $gtoa or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
4502 | |
### Writes bisulfite-converted copies of one mate file of a paired-end FastA library to $temp_dir.
###
### Arguments:
###   $file        - FastA input file; a name ending in .gz is read through zcat
###   $read_number - 1 or 2 (which mate of the pair the file holds); any other value is fatal
###
### The input is consumed as pairs of lines (header, sequence). Whitespace in the header is replaced via
### fix_IDs(), the sequence is upper-cased, and a read tag (/1 or /2, doubled when $bowtie2 is set) is
### appended to the header. If $directional is set only the C -> T version of read 1 or the G -> A version
### of read 2 is written; otherwise both conversions are written for the file.
### Honours the file-level settings $skip, $upto and $prefix; $gzip is not supported here (a warning is issued).
###
### Returns the name(s) of the converted file(s) WITHOUT the $temp_dir part:
###   directional, read 1: ($C_to_T_infile)
###   directional, read 2: ($G_to_A_infile)
###   otherwise:           ($C_to_T_infile,$G_to_A_infile)
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ### only the file name part (without any leading directories) is used to name the converted files
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### the input is opened with an explicit mode, and zcat is started without a shell, so that unusual
  ### characters in the file name can neither be taken as an open mode nor as shell syntax
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### validated once, before any output file is created
  unless ($read_number == 1 or $read_number == 2){
    die "Read number needs to be 1 or 2, but was: $read_number\n\n";
  }

  my $C_to_T_infile = "${filename}_C_to_T.fa";
  my $G_to_A_infile = "${filename}_G_to_A.fa";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### directional: read 1 only needs the C -> T and read 2 only the G -> A version; otherwise both are written
  my $write_C_to_T = (!$directional or $read_number == 1);
  my $write_G_to_A = (!$directional or $read_number == 2);

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### tag appended to every read ID: /1 or /2, written twice for Bowtie 2
  my $mate     = ($read_number == 1) ? 1 : 2;
  my $mate_tag = $bowtie2 ? "/$mate/$mate" : "/$mate";

  my $count = 0;

  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    next if ($skip and $count <= $skip);
    last if ($upto and $count > $upto);

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    ### '$' matches in front of the trailing newline, so the tag ends up at the end of the ID itself
    $header =~ s/$/$mate_tag/;

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} "$header$sequence_C_to_T";
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### not checked: when $upto ends the loop early a zcat child may legitimately exit with a broken pipe
  close $in;

  ### the converted files have to be closed (and thereby flushed) before their names are handed back,
  ### otherwise the last buffered reads may still be missing when the files are read again
  if ($write_C_to_T){
    close $ctot or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close $gtoa or die "Failed to close filehandle $!\n";
  }

  unless ($directional){
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }

  if ($read_number == 1){
    warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile);
  }

  warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
  return ($G_to_A_infile);
}
4659 | |
4660 | |
### Writes bisulfite-converted copies of a single-end FastQ file to $temp_dir.
###
### Argument:
###   $file - FastQ input file; a name ending in .gz is read through zcat
###
### Which conversions are written depends on file-level settings:
###   $pbat set          - G -> A version only
###   $directional set   - C -> T version only
###   neither            - both versions
### $pbat is checked first everywhere, so that opening, writing and closing of the output files always
### agree with each other. Output is piped through gzip if $gzip is set. $skip, $upto and $prefix are honoured.
### Records are read as blocks of 4 lines; only the first processed record is checked for '@' and '+' lines.
###
### Returns the output file name(s) WITHOUT the $temp_dir part:
###   $pbat:     ($G_to_A_infile)
###   otherwise: ($C_to_T_infile,$G_to_A_infile) - in the directional case the second element is just the
###              (prefixed) input file name, as no G -> A file is written
sub biTransformFastQFiles {
  my $file = shift;

  ### only the file name part (without any leading directories) is used to name the converted files
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### the input is opened with an explicit mode, and zcat is started without a shell, so that unusual
  ### characters in the file name can neither be taken as an open mode nor as shell syntax
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  my $write_C_to_T = !$pbat;
  my $write_G_to_A = ($pbat or !$directional);

  my $extension = $gzip ? 'fastq.gz' : 'fastq';

  ### opens one converted file for writing in $temp_dir, through gzip if requested
  ### NOTE: the gzip pipe still goes through the shell (needed for the output redirection)
  my $open_outfile = sub {
    my ($outfile) = @_;
    my $fh;
    if ($gzip){
      open ($fh,'|-',"gzip -c - > ${temp_dir}${outfile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    $C_to_T_infile .= "_C_to_T.$extension";
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    $ctot = $open_outfile->($C_to_T_infile);
  }
  if ($write_G_to_A){
    $G_to_A_infile .= "_G_to_A.$extension";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_outfile->($G_to_A_infile);
  }

  my $count = 0;
  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    next if ($skip and $count <= $skip);
    last if ($upto and $count > $upto);

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### not checked: when $upto ends the loop early a zcat child may legitimately exit with a broken pipe
  close $in;

  if ($write_C_to_T){
    close $ctot or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close $gtoa or die "Failed to close filehandle $!\n";
  }

  if ($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    return ($G_to_A_infile);
  }

  if ($directional){
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
4818 | |
### Writes bisulfite-converted copies of one mate file of a paired-end FastQ library to $temp_dir.
###
### Arguments:
###   $file        - FastQ input file; a name ending in .gz is read through zcat
###   $read_number - 1 or 2 (which mate of the pair the file holds); any other value is fatal
###
### Records are read as blocks of 4 lines. Whitespace in the read ID is replaced via fix_IDs(), the sequence
### is upper-cased, and a read tag (/1 or /2, doubled when $bowtie2 is set) is appended to the read ID.
### If $directional is set only the C -> T version of read 1 or the G -> A version of read 2 is written;
### otherwise both conversions are written. Output is piped through gzip if $gzip is set.
### $skip, $upto and $prefix are honoured. Only the first processed record is checked for '@' and '+' lines.
###
### Returns the name(s) of the converted file(s) WITHOUT the $temp_dir part:
###   directional, read 1: ($C_to_T_infile)
###   directional, read 2: ($G_to_A_infile)
###   otherwise:           ($C_to_T_infile,$G_to_A_infile)
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### only the file name part (without any leading directories) is used to name the converted files
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### the input is opened with an explicit mode, and zcat is started without a shell, so that unusual
  ### characters in the file name can neither be taken as an open mode nor as shell syntax
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  ### validated once, before any output file is created
  unless ($read_number == 1 or $read_number == 2){
    die "Read number needs to be 1 or 2, but was $read_number!\n\n";
  }

  my $extension     = $gzip ? 'fastq.gz' : 'fastq';
  my $C_to_T_infile = "${filename}_C_to_T.$extension";
  my $G_to_A_infile = "${filename}_G_to_A.$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### directional: read 1 only needs the C -> T and read 2 only the G -> A version; otherwise both are written
  my $write_C_to_T = (!$directional or $read_number == 1);
  my $write_G_to_A = (!$directional or $read_number == 2);

  ### opens one converted file for writing in $temp_dir, through gzip if requested
  ### NOTE: the gzip pipe still goes through the shell (needed for the output redirection)
  my $open_outfile = sub {
    my ($outfile) = @_;
    my $fh;
    if ($gzip){
      open ($fh,'|-',"gzip -c - > ${temp_dir}${outfile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($fh,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
    return $fh;
  };

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    $ctot = $open_outfile->($C_to_T_infile);
  }
  if ($write_G_to_A){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $gtoa = $open_outfile->($G_to_A_infile);
  }

  ### tag appended to every read ID: /1 or /2, written twice for Bowtie 2
  my $mate     = ($read_number == 1) ? 1 : 2;
  my $mate_tag = $bowtie2 ? "/$mate/$mate" : "/$mate";

  my $count = 0;
  while (1){
    my $identifier    = <$in>;
    my $sequence      = <$in>;
    my $identifier2   = <$in>;
    my $quality_score = <$in>;
    last unless ($identifier and $sequence and $identifier2 and $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    next if ($skip and $count <= $skip);
    last if ($upto and $count > $upto);

    $sequence = uc $sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    ### '$' matches in front of the trailing newline, so the tag ends up at the end of the ID itself
    $identifier =~ s/$/$mate_tag/;

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  ### not checked: when $upto ends the loop early a zcat child may legitimately exit with a broken pipe
  close $in;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close $ctot or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close $gtoa or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }

  warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  close $ctot or die "Failed to close filehandle $!\n";
  close $gtoa or die "Failed to close filehandle $!\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
4993 | |
4994 | |
4995 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES | |
4996 | |
### Converts both mate files of a paired-end FastQ library in one pass and writes gzipped, tab-delimited
### files with one line per read pair to $temp_dir:
###   <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>
###
### Arguments:
###   $file_1, $file_2 - FastQ files of read 1 and read 2; names ending in .gz are read through zcat
###
### The output files are named after $file_1 (without its path, optionally with "$prefix." in front):
###   <name>.CT_plus_GA.fastq.gz - read 1 C -> T and read 2 G -> A converted (always written)
###   <name>.GA_plus_CT.fastq.gz - read 1 G -> A and read 2 C -> T converted (only unless $directional)
### The sequence ID is taken from read 1: whitespace is replaced via fix_IDs(), the leading '@' is removed
### and '/1' is appended. $skip and $upto are honoured. Reading stops as soon as either file runs out of
### complete 4-line records.
###
### Returns ($CT_plus_GA_infile) if $directional is set, otherwise ($CT_plus_GA_infile,$GA_plus_CT_infile);
### the names do NOT include the $temp_dir part.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;

  ### only the file name part of read 1 (without any leading directories) is used to name the output files
  my $filename = $file_1;
  if ($file_1 =~ /\//){
    ($filename) = $file_1 =~ m/.*\/(.*)$/;
  }

  ### opens an input file with an explicit mode, starting zcat without a shell for .gz files, so that
  ### unusual characters in the file name can neither be taken as an open mode nor as shell syntax
  my $open_infile = sub {
    my ($infile) = @_;
    my $fh;
    if ($infile =~ /\.gz$/){
      open ($fh,'-|','zcat',$infile) or die "Couldn't read from file $infile: $!\n";
    }
    else{
      open ($fh,'<',$infile) or die "Couldn't read from file $infile: $!\n";
    }
    return $fh;
  };

  my $in_1 = $open_infile->($file_1);
  my $in_2 = $open_infile->($file_2);

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  ### NOTE: the gzip pipes still go through the shell (needed for the output redirection)
  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (my $ct_plus_ga,'|-',"gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n";

  my $ga_plus_ct;
  unless ($directional){
    ### reported via warn like every other progress message of the conversion subroutines
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open ($ga_plus_ct,'|-',"gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1    = <$in_1>;
    my $sequence_1      = <$in_1>;
    my $identifier2_1   = <$in_1>;
    my $quality_score_1 = <$in_1>;

    my $identifier_2    = <$in_2>;
    my $sequence_2      = <$in_2>;
    my $identifier2_2   = <$in_2>;
    my $quality_score_2 = <$in_2>;

    last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    next if ($skip and $count <= $skip);
    last if ($upto and $count > $upto);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp ($identifier_1,$sequence_1,$sequence_2,$quality_score_1,$quality_score_2);

    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; # adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    $sequence_1 = uc $sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc $sequence_2; # make input file 2 case insensitive

    (my $sequence_1_C_to_T = $sequence_1) =~ tr/C/T/;
    (my $sequence_2_G_to_A = $sequence_2) =~ tr/G/A/;

    print {$ct_plus_ga} "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      (my $sequence_1_G_to_A = $sequence_1) =~ tr/G/A/;
      (my $sequence_2_C_to_T = $sequence_2) =~ tr/C/T/;
      print {$ga_plus_ct} "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  ### not checked: when $upto ends the loop early a zcat child may legitimately exit with a broken pipe
  close $in_1;
  close $in_2;

  close $ct_plus_ga or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }

  close $ga_plus_ct or die "Couldn't close filehandle\n";
  warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
  return ($CT_plus_GA_infile,$GA_plus_CT_infile);
}
5134 | |
5135 | |
### Returns a copy of the given read ID line in which every run of spaces and/or tabs has been
### collapsed into a single underscore. Other characters (including a trailing newline) are left alone.
sub fix_IDs{
  my ($read_id) = @_;
  $read_id =~ s/[ \t]+/_/g; # one underscore per stretch of blanks/tabs
  return $read_id;
}
5141 | |
### Checks whether a single-end alignment was reported on the strand that makes sense for the
### read/genome conversion combination of the Bowtie instance it came from.
###
### Arguments:
###   $index  - index into @fhs of the instance which produced the alignment
###   $strand - strand of the alignment, '+' or '-'
###
### Sensical strand per instance name ($fhs[$index]->{name}):
###   CTreadCTgenome  (C->T read against C->T genome)  '+'
###   CTreadGAgenome  (C->T read against G->A genome)  '-'
###   GAreadCTgenome  (G->A read against C->T genome)  '-'
###   GAreadGAgenome  (G->A read against G->A genome)  '+'
###
### Returns 1 and increments the instance's {seen} counter if the strand is the sensical one.
### Returns 0 and increments {wrong_strand} if it is the opposite strand.
### Also returns 0 (explicitly, without counting anything) for a strand that is neither '+' nor '-'.
### Dies if the instance name is not one of the four above.
sub ensure_sensical_alignment_orientation_single_end{
  my ($index,$strand) = @_;

  my %sensical_strand_of = (
    CTreadCTgenome => '+',
    CTreadGAgenome => '-',
    GAreadCTgenome => '-',
    GAreadGAgenome => '+',
  );

  my $sensical_strand = $sensical_strand_of{ $fhs[$index]->{name} };
  die "One of the above conditions must be true\n" unless (defined $sensical_strand);

  ### alignment is on the expected strand: count it and report a correct orientation
  if ($strand eq $sensical_strand){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### alignment is on the opposite strand: nonsensical
  if ($strand eq '+' or $strand eq '-'){
    $fhs[$index]->{wrong_strand}++;
  }

  return 0;
}
5214 | |
### Checks whether a paired-end alignment was reported in the mate order that makes sense for the
### read/genome conversion combination of the Bowtie instance it came from.
###
### Arguments:
###   $index              - index into @fhs of the instance which produced the alignment
###   $id_1, $strand_1    - read ID and strand of the first reported alignment line
###   $id_2, $strand_2    - read ID and strand of the second reported alignment line
###
### Only two constellations are accepted at all:
###   (a) first line is read 1 (ID ends in 1) on '+', second line is read 2 (ID ends in 2) on '-'
###   (b) first line is read 2 (ID ends in 2) on '+', second line is read 1 (ID ends in 1) on '-'
### Instances CTread1GAread2CTgenome and GAread1CTread2GAgenome are sensical for (a),
### instances GAread1CTread2CTgenome and CTread1GAread2GAgenome are sensical for (b).
###
### Returns 1 and increments the instance's {seen} counter for the sensical constellation,
### returns 0 and increments {wrong_strand} for the other one.
### Dies for an unknown instance name or if neither (a) nor (b) applies.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  ### which constellation does this instance expect?
  my $instance = $fhs[$index]->{name};
  my $sensical_constellation;
  if ($instance eq 'CTread1GAread2CTgenome' or $instance eq 'GAread1CTread2GAgenome'){
    $sensical_constellation = 'a';
  }
  elsif ($instance eq 'GAread1CTread2CTgenome' or $instance eq 'CTread1GAread2GAgenome'){
    $sensical_constellation = 'b';
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### which constellation was actually reported?
  my $reported_constellation;
  if ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-'){
    $reported_constellation = 'a';
  }
  elsif ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-'){
    $reported_constellation = 'b';
  }
  else{
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  if ($reported_constellation eq $sensical_constellation){
    $fhs[$index]->{seen}++;
    return 1;
  }

  $fhs[$index]->{wrong_strand}++;
  return 0;
}
5311 | |
5312 ##################################################################################################################################################### | |
5313 | |
5314 ### Bowtie 1 (default) | PAIRED-END | FASTA | |
5315 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
    # Bowtie 1 | paired-end | FastA.
    #
    # Starts one Bowtie process per entry of @fhs (reading -1/-2 from the entry's
    # inputfile_1/inputfile_2 below $temp_dir) and caches the first reported pair:
    #   last_line_1 / last_line_2 - the first two output lines (chomped)
    #   last_seq_id               - the read 1 identifier without its trailing "/1"
    # All three are set to undef when an entry is skipped or Bowtie returns nothing.
    #
    # Arguments: C->T and G->A converted file of read 1, then C->T and G->A
    # converted file of read 2; inside this sub they are only used for logging.
    # Dies if the pipe cannot be opened or neither of the two lines is read 1.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    if ($directional){
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without an inputfile_1 are not aligned at all
        if ($directional and !$fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        # restrict Bowtie to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options = $bowtie_options;
        $bt_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
        my $bowtie_pipe = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |";
        open ($fh->{fh},$bowtie_pipe) or die "Can't open pipe to bowtie: $!";

        my $line_1 = $fh->{fh}->getline();
        my $line_2 = $fh->{fh}->getline();

        # no (complete) first pair: initialise everything as undefined and move on
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp ($line_1, $line_2);

        # the sequence identifier is the first tab-separated field of an output line
        my $id_1 = (split /\t/, $line_1)[0];
        my $id_2 = (split /\t/, $line_2)[0];

        # Bowtie reports the mate with the smaller chromosomal position first, so
        # read 1 may be on either line; whichever ID carries the /1 tag is stored
        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1; # either read 1 or read 2
        $fh->{last_line_2} = $line_2; # the respective other read
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5394 | |
5395 ### Bowtie 2 | PAIRED-END | FASTA | |
5396 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    # Bowtie 2 | paired-end | FastA.
    #
    # Starts one Bowtie 2 process per entry of @fhs (reading -1/-2 from the entry's
    # inputfile_1/inputfile_2 below $temp_dir), skips the SAM header and caches the
    # first reported pair:
    #   last_line_1 / last_line_2 - the first two non-header output lines (chomped)
    #   last_seq_id               - the read 1 identifier without its trailing "/1"
    # All three are set to undef when an entry is skipped or Bowtie 2 returns nothing.
    #
    # Arguments: C->T and G->A converted file of read 1, then C->T and G->A
    # converted file of read 2; inside this sub they are only used for logging.
    # Dies if the pipe cannot be opened or neither of the two lines is read 1
    # (previously this case only warned and left last_seq_id unset, unlike the
    # other paired-end subs).
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    if ($directional){
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without an inputfile_1 are not aligned at all
        if ($directional and !$fh->{inputfile_1}){
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
            next;
        }

        # restrict Bowtie 2 to the strand that makes sense for this read/genome combination
        my $bt2_options = $bowtie_options;
        if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
            $bt2_options .= ' --norc';
        }
        else {
            $bt2_options .= ' --nofw';
        }

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM, so everything up to the first non-header line is skipped
        # (SAM header lines start with @). A lexical is used so that the caller's $_
        # is left untouched.
        my $line_1 = $fh->{fh}->getline();
        while ($line_1 and $line_1 =~ /^\@/){
            $line_1 = $fh->{fh}->getline();
        }
        my $line_2 = $fh->{fh}->getline();

        # no (complete) first pair: initialise everything as undefined and move on
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line_1} = undef;
            $fh->{last_line_2} = undef;
            next;
        }

        chomp $line_1;
        chomp $line_2;

        # the sequence identifier is the first tab-separated field of an output line
        my $id_1 = (split(/\t/,$line_1))[0];
        my $id_2 = (split(/\t/,$line_2))[0];

        # Whichever ID still carries a /1 tag is read 1 (Bowtie 2 clips one /1 or /2
        # itself, which is why /1/1 and /2/2 were appended to start with).
        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            # without a read 1 ID the cached pair cannot be matched to a sequence,
            # so this is fatal, exactly as in the other paired-end subs
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1; # either read 1 or read 2
        $fh->{last_line_2} = $line_2; # the respective other read
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5484 | |
5485 ### Bowtie 1 (default) | PAIRED-END | FASTQ | |
5486 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
    # Bowtie 1 | paired-end | FastQ.
    #
    # Starts one Bowtie process per entry of @fhs and caches the first reported pair:
    #   last_line_1 / last_line_2 - the first two output lines (chomped)
    #   last_seq_id               - the read 1 identifier without its trailing "/1"
    # All three are set to undef when an entry is skipped or Bowtie returns nothing.
    # With $gzip set, inputfile_1 is streamed through zcat into Bowtie's --12 input;
    # otherwise inputfile_1/inputfile_2 are handed over via -1/-2.
    #
    # Arguments: C->T and G->A converted file of read 1, then C->T and G->A
    # converted file of read 2; inside this sub they are only used for logging.
    # Dies if the pipe cannot be opened or neither of the two lines is read 1.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile_1 (FastQ)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    elsif ($pbat){
        warn "Input file is $G_to_A_infile_1 (FastQ; PBAT-Seq)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 (FastQ)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional and PBAT mode, entries without an inputfile_1 are unwanted
        if (($directional or $pbat) and !$fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        # restrict Bowtie to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt_options = $bowtie_options;
        $bt_options .= $forward_only ? ' --norc' : ' --nofw';

        my $bowtie_pipe;
        if ($gzip){
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
            $bowtie_pipe = "zcat ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |";
        }
        else{
            warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
            sleep(5);
            $bowtie_pipe = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |";
        }
        open ($fh->{fh},$bowtie_pipe) or die "Can't open pipe to bowtie: $!";

        my $line_1 = $fh->{fh}->getline();
        my $line_2 = $fh->{fh}->getline();

        # no (complete) first pair: initialise everything as undefined and move on
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp ($line_1, $line_2);

        # the sequence identifier is the first tab-separated field of an output line
        my $id_1 = (split /\t/, $line_1)[0];
        my $id_2 = (split /\t/, $line_2)[0];

        # Bowtie reports the mate with the smaller chromosomal position first, so
        # read 1 may be on either line; whichever ID carries the /1 tag is stored
        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1; # either read 1 or read 2
        $fh->{last_line_2} = $line_2; # the respective other read
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5575 | |
5576 ### Bowtie 2 | PAIRED-END | FASTQ | |
5577 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    # Bowtie 2 | paired-end | FastQ.
    #
    # Starts one Bowtie 2 process per entry of @fhs (reading -1/-2 from the entry's
    # inputfile_1/inputfile_2 below $temp_dir), skips the SAM header and caches the
    # first reported pair:
    #   last_line_1 / last_line_2 - the first two non-header output lines (chomped)
    #   last_seq_id               - the read 1 identifier without its trailing "/1"
    # All three are set to undef when an entry is skipped or Bowtie 2 returns nothing.
    #
    # Arguments: C->T and G->A converted file of read 1, then C->T and G->A
    # converted file of read 2; inside this sub they are only used for logging.
    # Dies if the pipe cannot be opened or neither of the two lines is read 1.
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

    if ($directional){
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # in directional mode, entries without an inputfile_1 are not aligned at all
        if ($directional and !$fh->{inputfile_1}){
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        # restrict Bowtie 2 to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
        my $bt2_options = $bowtie_options;
        $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
        my $bowtie2_pipe = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |";
        open ($fh->{fh},$bowtie2_pipe) or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading while the line is a header line (starts with @);
        # stops on the first record or when there is no output left
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        my $line_1 = $_;
        my $line_2 = $fh->{fh}->getline();

        # no (complete) first pair: initialise everything as undefined and move on
        unless ($line_1 and $line_2) {
            warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
            @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
            next;
        }

        chomp ($line_1, $line_2);

        # the sequence identifier is the first tab-separated field of an output line
        my $id_1 = (split /\t/, $line_1)[0];
        my $id_2 = (split /\t/, $line_2)[0];

        # Whichever ID still carries a /1 tag is read 1 (Bowtie 2 clips one /1 or /2
        # itself, which is why /1/1 and /2/2 were appended to start with).
        if ($id_1 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_1;
        }
        elsif ($id_2 =~ s/\/1$//){
            $fh->{last_seq_id} = $id_2;
        }
        else{
            die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
        }

        $fh->{last_line_1} = $line_1; # either read 1 or read 2
        $fh->{last_line_2} = $line_2; # the respective other read
        warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
}
5666 | |
5667 ##################################################################################################################################################### | |
5668 | |
5669 ### Bowtie 1 (default) | SINGLE-END | FASTA | |
sub single_end_align_fragments_to_bisulfite_genome_fastA {
    # Bowtie 1 | single-end | FastA.
    #
    # Starts one Bowtie process per entry of @fhs on the entry's inputfile (below
    # $temp_dir; streamed through zcat when $gzip is set) and caches the first
    # output line in last_line and its first tab-separated field in last_seq_id.
    # Both are set to undef if Bowtie returns nothing.
    #
    # Arguments: the C->T and the G->A converted read file; inside this sub they
    # are only used for logging. Dies if the pipe to Bowtie cannot be opened.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastA)\n";
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # restrict Bowtie to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt_options = $bowtie_options;
        $bt_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

        # compressed input is decompressed on the fly and read by Bowtie from STDIN ("-")
        my $bowtie_pipe = $gzip
            ? "zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |"
            : "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |";
        open ($fh->{fh},$bowtie_pipe) or die "Can't open pipe to bowtie: $!";

        $_ = $fh->{fh}->getline();

        # no output at all: initialise last_seq_id and last_line as undefined
        unless ($_) {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
            next;
        }

        chomp;
        $fh->{last_line}   = $_;
        $fh->{last_seq_id} = (split /\t/)[0]; # first field of the Bowtie output = sequence identifier
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5723 | |
5724 ### Bowtie 2 | SINGLE-END | FASTA | |
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
    # Bowtie 2 | single-end | FastA.
    #
    # Starts one Bowtie 2 process per entry of @fhs on the entry's inputfile (-U,
    # below $temp_dir), skips the SAM header and caches the first record in
    # last_line and its first tab-separated field in last_seq_id. Both are set to
    # undef if Bowtie 2 produces no record.
    #
    # Arguments: the C->T and the G->A converted read file; inside this sub they
    # are only used for logging. Dies if the pipe to Bowtie 2 cannot be opened.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastA)\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # restrict Bowtie 2 to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt2_options = $bowtie_options;
        $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
        my $bowtie2_pipe = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |";
        open ($fh->{fh},$bowtie2_pipe) or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading while the line is a header line (starts with @);
        # stops on the first record or when there is no output left
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        # nothing but header lines (or nothing at all): initialise as undefined
        unless ($_) {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
            next;
        }

        # Bowtie 2 prints a result line even for sequences without any alignment,
        # so the first record is stored unconditionally
        chomp;
        $fh->{last_line}   = $_;
        $fh->{last_seq_id} = (split /\t/)[0]; # first field of the Bowtie 2 output = sequence identifier
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5783 | |
5784 | |
5785 ### Bowtie 1 (default) | SINGLE-END | FASTQ | |
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
    # Bowtie 1 | single-end | FastQ.
    #
    # Starts one Bowtie process per entry of @fhs on the entry's inputfile (below
    # $temp_dir; streamed through zcat when $gzip is set) and caches the first
    # output line in last_line and its first tab-separated field in last_seq_id.
    # Both are set to undef if Bowtie returns nothing.
    #
    # Arguments: the C->T and the G->A converted read file; inside this sub they
    # are only used for logging. Dies if the pipe to Bowtie cannot be opened.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastQ)\n";
    }
    elsif($pbat){
        warn "Input file is $G_to_A_infile (FastQ)\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
    }

    if ($directional or $pbat){
        warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # restrict Bowtie to the strand that makes sense for this read/genome combination
        my $bt_options = $bowtie_options;
        if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
            $bt_options .= ' --norc';
        }
        else {
            $bt_options .= ' --nofw';
        }

        warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
        sleep (5);

        # The commands have to use $bt_options (not the plain $bowtie_options), otherwise
        # the --norc/--nofw restriction announced in the message above is never applied.
        if ($gzip){
            # compressed input is decompressed on the fly and read by Bowtie from STDIN ("-")
            open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
        }
        else{
            open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
        }

        # a lexical is used for the line so that the caller's $_ is left untouched
        my $first_line = $fh->{fh}->getline();

        # if Bowtie produces an alignment we store the first line of the output
        if ($first_line) {
            chomp $first_line;
            my $id = (split(/\t/,$first_line))[0]; # first field of the Bowtie output = sequence identifier
            $fh->{last_seq_id} = $id;
            $fh->{last_line}   = $first_line;
            warn "Found first alignment:\t$fh->{last_line}\n";
        }
        # otherwise we just initialise last_seq_id and last_line as undefined
        else {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            $fh->{last_seq_id} = undef;
            $fh->{last_line}   = undef;
        }
    }
}
5844 | |
5845 ### Bowtie 2 | SINGLE-END | FASTQ | |
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
    # Bowtie 2 | single-end | FastQ.
    #
    # Starts one Bowtie 2 process per entry of @fhs on the entry's inputfile (-U,
    # below $temp_dir), skips the SAM header and caches the first record in
    # last_line and its first tab-separated field in last_seq_id. Both are set to
    # undef if Bowtie 2 produces no record.
    #
    # Arguments: the C->T and the G->A converted read file; inside this sub they
    # are only used for logging. Dies if the pipe to Bowtie 2 cannot be opened.
    my ($C_to_T_infile,$G_to_A_infile) = @_;

    if ($directional){
        warn "Input file is $C_to_T_infile (FastQ)\n\n";
        warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }
    else{
        warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
        warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
    }

    foreach my $fh (@fhs) {

        # restrict Bowtie 2 to the strand that makes sense for this read/genome combination
        my $forward_only = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
        my $bt2_options = $bowtie_options;
        $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

        warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
        warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

        my $bowtie2_pipe = "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |";
        open ($fh->{fh},$bowtie2_pipe) or die "Can't open pipe to bowtie: $!";

        # Bowtie 2 writes SAM: keep reading while the line is a header line (starts with @);
        # stops on the first record or when there is no output left
        do {
            $_ = $fh->{fh}->getline();
        } while ($_ and $_ =~ /^\@/);

        # nothing but header lines (or nothing at all): initialise as undefined
        unless ($_) {
            warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
            @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
            next;
        }

        # Bowtie 2 prints a result line even for sequences without any alignment,
        # so the first record is stored unconditionally
        chomp;
        $fh->{last_line}   = $_;
        $fh->{last_seq_id} = (split /\t/)[0]; # first field of the Bowtie 2 output = sequence identifier
        warn "Found first alignment:\t$fh->{last_line}\n";
    }
}
5906 | |
5907 ########################################################################################################################################### | |
5908 | |
sub reset_counters_and_fhs{
    # Re-initialises the two file-level data structures used per input file:
    #   %counting - every counter is set back to 0
    #   @fhs      - a fresh list of alignment-instance records, each holding
    #               name, strand_identity, bisulfiteIndex, seen and wrong_strand
    #
    # Argument: the input file name; a ',' in it marks paired-end input.
    # Which records end up in @fhs:
    #   --directional, single-end : the two CT-read instances
    #   --pbat,        single-end : the two GA-read instances
    #   everything else           : all four instances
    my $filename = shift;

    my @counter_names = (
        'total_meCHH_count',
        'total_meCHG_count',
        'total_meCpG_count',
        'total_meC_unknown_count',
        'total_unmethylated_CHH_count',
        'total_unmethylated_CHG_count',
        'total_unmethylated_CpG_count',
        'total_unmethylated_C_unknown_count',
        'sequences_count',
        'no_single_alignment_found',
        'unsuitable_sequence_count',
        'genomic_sequence_could_not_be_extracted_count',
        'unique_best_alignment_count',
        'low_complexity_alignments_overruled_count',
        'CT_CT_count',                # CT read/CT genome, original top strand
        'CT_GA_count',                # CT read/GA genome, original bottom strand
        'GA_CT_count',                # GA read/CT genome, complementary to original top strand
        'GA_GA_count',                # GA read/GA genome, complementary to original bottom strand
        'CT_GA_CT_count',             # CT read1/GA read2/CT genome, original top strand
        'GA_CT_GA_count',             # GA read1/CT read2/GA genome, complementary to original bottom strand
        'GA_CT_CT_count',             # GA read1/CT read2/CT genome, complementary to original top strand
        'CT_GA_GA_count',             # CT read1/GA read2/GA genome, original bottom strand
        'alignments_rejected_count',  # only relevant if --directional was specified
    );
    %counting = map { $_ => 0 } @counter_names;

    # name, strand identity and bisulfite index of the four possible instances
    my @instances = (
        [ 'CTreadCTgenome', 'con ori forward',       $CT_index_basename ],
        [ 'CTreadGAgenome', 'con ori reverse',       $GA_index_basename ],
        [ 'GAreadCTgenome', 'compl ori con forward', $CT_index_basename ],
        [ 'GAreadGAgenome', 'compl ori con reverse', $GA_index_basename ],
    );

    # all four by default; only single-end directional/PBAT runs use a subset
    my @wanted = (0 .. 3);
    if ($directional){
        @wanted = (0, 1) unless $filename =~ ',';
    }
    elsif ($pbat){
        @wanted = (2, 3) unless $filename =~ ',';
    }

    @fhs = map {
        my ($name, $strand_identity, $index_basename) = @{$_};
        +{
            name            => $name,
            strand_identity => $strand_identity,
            bisulfiteIndex  => $index_basename,
            seen            => 0,
            wrong_strand    => 0,
        };
    } @instances[@wanted];
}
6058 | |
6059 | |
### Parses and validates the Bismark command line (options via Getopt::Long, then the positional
### arguments left in @ARGV: the genome folder followed by the single-end read files).
###
### Side effects visible in this sub:
###   - dies with an explanatory message on any invalid or incompatible option
###   - prints the help text / version banner and exits if requested
###   - changes the working directory several times (genome folder, index folders, output and temp folder)
###   - fills the file-level array @filenames ("mate1,mate2" strings for paired-end, plain names otherwise)
###   - pauses (sleep) after several notices so that they can be read on a terminal
###
### Returns a list of 29 values, in this order:
###   genome folder (absolute, trailing '/'), C->T index basename, G->A index basename, aligner command,
###   sequence format ('FASTA'/'FASTQ'), Bowtie option string, directional flag, unmapped flag, ambiguous flag,
###   phred64 flag, solexa flag, output dir, bowtie2 flag, vanilla flag, sam_no_hd flag, skip, upto, temp dir,
###   non_bs_mm flag, insertion open/extend penalty, deletion open/extend penalty, gzip flag,
###   bam (1 = via Samtools, 2 = fall back to gzipped SAM), Samtools path, pbat flag, prefix, old_flag
sub process_command_line{
  my @bowtie_options;   # options passed through to Bowtie 1/2; the push order below defines the final option string

  my ($help,$version,$quiet);
  my ($mates1,$mates2);
  my ($path_to_bowtie,$samtools_path);
  my ($fastq,$fasta,$sequence_format);
  my ($skip,$qupto);
  my ($phred64,$phred33,$solexa);
  my ($mismatches,$seed_length,$best,$chunk,$ceiling);
  my ($non_directional,$pbat);
  my ($maxins,$minins);
  my ($unmapped,$multi_map);
  my ($output_dir,$temp_dir,$prefix);
  my ($bowtie2,$vanilla,$sam_no_hd,$bam,$gzip,$old_flag,$non_bs_mm);
  my ($seed_extension_fails,$reseed_repetitive_seeds,$most_valid_alignments,$score_min,$parallel);
  my ($rdg,$rfg);

  ### appends a '/' to a path unless it already ends in one
  my $with_trailing_slash = sub {
    my ($path) = @_;
    $path .= '/' unless ($path =~ /\/$/);
    return $path;
  };

  ### moves into $dir (creating it first if we cannot enter it) and returns its absolute path with a trailing '/'.
  ### $label ('output' or 'temporary') is only used for the notice printed after creating the directory
  my $enter_or_create_dir = sub {
    my ($dir,$label) = @_;
    $dir = $with_trailing_slash->($dir);
    unless (chdir $dir){
      mkdir $dir or die "Unable to create directory $dir $!\n";
      warn "Created $label directory $dir!\n\n";
      chdir $dir or die "Failed to move to $dir\n";
    }
    return $with_trailing_slash->(getcwd()); # making the path absolute
  };

  my $options_ok = GetOptions ('help|man'                => \$help,
			       '1=s'                     => \$mates1,
			       '2=s'                     => \$mates2,
			       'path_to_bowtie=s'        => \$path_to_bowtie,
			       'f|fasta'                 => \$fasta,
			       'q|fastq'                 => \$fastq,
			       's|skip=i'                => \$skip,
			       'u|upto=i'                => \$qupto,
			       'phred33-quals'           => \$phred33,
			       'phred64-quals|solexa1'   => \$phred64,
			       'solexa-quals'            => \$solexa,
			       'n|seedmms=i'             => \$mismatches,
			       'l|seedlen=i'             => \$seed_length,
			       'no_best'                 => \$best,
			       'version'                 => \$version,
			       'quiet'                   => \$quiet,
			       'chunkmbs=i'              => \$chunk,
			       'non_directional'         => \$non_directional,
			       'I|minins=i'              => \$minins,
			       'X|maxins=i'              => \$maxins,
			       'e|maqerr=i'              => \$ceiling,
			       'un|unmapped'             => \$unmapped,
			       'ambiguous'               => \$multi_map,
			       'o|output_dir=s'          => \$output_dir,
			       'bowtie2'                 => \$bowtie2,
			       'vanilla'                 => \$vanilla,
			       'sam-no-hd'               => \$sam_no_hd,
			       'D=i'                     => \$seed_extension_fails,
			       'R=i'                     => \$reseed_repetitive_seeds,
			       'score_min=s'             => \$score_min,
			       'most_valid_alignments=i' => \$most_valid_alignments,
			       'p=i'                     => \$parallel,
			       'temp_dir=s'              => \$temp_dir,
			       'rdg=s'                   => \$rdg,
			       'rfg=s'                   => \$rfg,
			       'non_bs_mm'               => \$non_bs_mm,
			       'samtools_path=s'         => \$samtools_path,
			       'bam'                     => \$bam,
			       'gzip'                    => \$gzip,
			       'pbat'                    => \$pbat,
			       'prefix=s'                => \$prefix,
			       'old_flag'                => \$old_flag,
			      );

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($options_ok){
    die "Please respecify command line options\n";
  }

  ### HELPFILE
  if ($help){
    print_helpfile();
    exit;
  }

  ### VERSION
  ### NOTE(review): the banner text below is reproduced as it appears in the listing this was taken from; any
  ### centering whitespace the banner may have had could not be recovered from there - please confirm
  if ($version){
    print << "VERSION";


Bismark - Bisulfite Mapper and Methylation Caller.

Bismark Version: $bismark_version
Copyright 2010-13 Felix Krueger, Babraham Bioinformatics
www.bioinformatics.babraham.ac.uk/projects/


VERSION
    exit;
  }


  ##########################
  ### PROCESSING OPTIONS ###
  ##########################

  ### these two flags are returned as plain 0/1 values
  $bowtie2   = 0 unless ($bowtie2);
  $sam_no_hd = 0 unless ($sam_no_hd);

  ### PATH TO BOWTIE
  ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
  if ($path_to_bowtie){
    $path_to_bowtie = $with_trailing_slash->($path_to_bowtie);
    unless (-d $path_to_bowtie){
      die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
    }
    $path_to_bowtie .= $bowtie2 ? 'bowtie2' : 'bowtie';
  }
  elsif ($bowtie2){
    $path_to_bowtie = 'bowtie2';
    warn "Path to Bowtie 2 specified as: $path_to_bowtie\n";
  }
  else{
    $path_to_bowtie = 'bowtie';
    warn "Path to Bowtie specified as: $path_to_bowtie\n";
  }

  ### OUTPUT REQUESTED AS BAM FILE
  if ($bam){
    if ($vanilla){
      die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n";
    }

    ### PATH TO SAMTOOLS
    if (defined $samtools_path){
      # the path may either be the full Samtools command or the folder containing it
      unless ($samtools_path =~ /samtools$/){
	$samtools_path = $with_trailing_slash->($samtools_path) . 'samtools';
      }
      unless (-e $samtools_path){
	die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
      }
      warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n";
      $bam = 1;
    }
    # Check whether Samtools is in the PATH if no path was supplied by the user
    else{
      my $samtools_in_path = `which samtools 2>/dev/null`; # a single lookup; exit status 0 means it was found
      if ($? == 0){
	chomp $samtools_in_path;
	$samtools_path = $samtools_in_path;
	warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n";
	$bam = 1;
      }
    }

    unless (defined $samtools_path){
      $bam = 2; # 2 = no Samtools available
      warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n";
    }
    sleep (1);
  }


  ####################################
  ### PROCESSING ARGUMENTS

  ### GENOME FOLDER
  my $genome_folder = shift @ARGV; # mandatory
  unless ($genome_folder){
    warn "Genome folder was not specified!\n";
    print_helpfile();
    exit 1; # this is an error, so do not report success to the calling shell or pipeline
  }

  ### checking that the genome folder, all subfolders and the required bowtie index files exist
  $genome_folder = $with_trailing_slash->($genome_folder);

  if (chdir $genome_folder){
    my $absolute_genome_folder = $with_trailing_slash->(getcwd()); ## making the genome folder path absolute
    warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder')\n";
    $genome_folder = $absolute_genome_folder;
  }
  else{
    die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
  }

  my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
  my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";

  my @index_parts = qw(1 2 3 4 rev.1 rev.2); # every index consists of these six files

  if ($bowtie2){ ### Bowtie 2 (new)
    ### checking the integrity of $CT_dir
    chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
    foreach my $part (@index_parts){
      my $file = "BS_CT.$part.bt2";
      unless (-f $file){
	die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run the bismark_genome_preparation before running Bismark\n";
      }
    }
    ### checking the integrity of $GA_dir
    chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
    foreach my $part (@index_parts){
      my $file = "BS_GA.$part.bt2";
      unless (-f $file){
	die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark\n";
      }
    }
  }
  else{ ### Bowtie 1 (default)
    ### checking the integrity of $CT_dir
    chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
    foreach my $part (@index_parts){
      my $file = "BS_CT.$part.ebwt";
      unless (-f $file){
	die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
      }
    }
    ### checking the integrity of $GA_dir
    chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
    foreach my $part (@index_parts){
      my $file = "BS_GA.$part.ebwt";
      unless (-f $file){
	die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n";
      }
    }
  }

  my $CT_index_basename = "${CT_dir}BS_CT";
  my $GA_index_basename = "${GA_dir}BS_GA";

  ### INPUT OPTIONS

  ### SEQUENCE FILE FORMAT
  ### exits if both fastA and FastQ were specified
  if ($fasta and $fastq){
    die "Only one sequence filetype can be specified (fastA or fastQ)\n";
  }

  ### unless fastA is specified explicitely, fastQ sequence format is expected by default
  if ($fasta){
    print "FastA format specified\n";
    $sequence_format = 'FASTA';
    push @bowtie_options, '-f';
  }
  else{
    if ($fastq){
      print "FastQ format specified\n";
    }
    else{
      $fastq = 1; # the quality checks further down rely on this flag
      print "FastQ format assumed (by default)\n";
    }
    $sequence_format = 'FASTQ';
    push @bowtie_options, '-q';
  }

  ### SKIP / UPTO
  ### both values are only announced here and handed back to the caller, they are not passed on to Bowtie
  if ($skip){
    warn "Skipping the first $skip reads from the input file\n";
  }
  if ($qupto){
    warn "Processing sequences up to read no. $qupto from the input file\n";
  }

  ### QUALITY VALUES
  if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
    # trailing newline added so that Perl does not append "at <script> line <n>." to this user-facing message
    die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)\n";
  }
  if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? "--phred33" : "--phred33-quals");
  }
  if ($phred64){
    # Phred quality values work only when -q is specified
    unless ($fastq){
      die "Phred quality values work only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options, ($bowtie2 ? "--phred64" : "--phred64-quals");
  }
  else{
    $phred64 = 0;
  }

  if ($solexa){
    if ($bowtie2){
      die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
    }
    # Solexa to Phred value conversion works only when -q is specified
    unless ($fastq){
      die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
    }
    push @bowtie_options,"--solexa-quals";
  }
  else{
    $solexa = 0;
  }

  ### ALIGNMENT OPTIONS

  ### MISMATCHES
  if (defined $mismatches){
    if ($bowtie2){
      unless ($mismatches == 0 or $mismatches == 1){
	die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
      }
      push @bowtie_options,"-N $mismatches";
    }
    else{
      unless ($mismatches >= 0 and $mismatches <= 3){
	die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
      }
      push @bowtie_options,"-n $mismatches";
    }
  }
  elsif (!$bowtie2){
    push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
  }

  ### SEED LENGTH
  if (defined $seed_length){
    push @bowtie_options, ($bowtie2 ? "-L $seed_length" : "-l $seed_length");
  }

  ### MISMATCH CEILING
  if (defined $ceiling){
    die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
    push @bowtie_options,"-e $ceiling";
  }


  ### BOWTIE 2 EFFORT OPTIONS

  ### CONSECUTIVE SEED EXTENSION FAILS
  if (defined $seed_extension_fails){
    die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-D $seed_extension_fails";
  }

  ### RE-SEEDING REPETITIVE SEEDS
  if (defined $reseed_repetitive_seeds){
    die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    push @bowtie_options,"-R $reseed_repetitive_seeds";
  }


  ### BOWTIE 2 SCORING OPTIONS
  if ($score_min){
    die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($score_min =~ /^L,.+,.+$/){
      die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    push @bowtie_options,"--score-min $score_min";
  }
  elsif ($bowtie2){
    push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
  }

  ### BOWTIE 2 READ GAP OPTIONS
  ### the gap penalties default to 5 (open) and 3 (extend) and are returned to the caller
  my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend);

  if ($rdg){
    die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($rdg =~ /^(\d+),(\d+)$/){
      die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    ($deletion_open,$deletion_extend) = ($1,$2);
    push @bowtie_options,"--rdg $rdg";
  }
  else{
    ($deletion_open,$deletion_extend) = (5,3);
  }

  ### BOWTIE 2 REFERENCE GAP OPTIONS
  if ($rfg){
    die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
    unless ($rfg =~ /^(\d+),(\d+)$/){
      die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
    }
    ($insertion_open,$insertion_extend) = ($1,$2);
    push @bowtie_options,"--rfg $rfg";
  }
  else{
    ($insertion_open,$insertion_extend) = (5,3);
  }


  ### BOWTIE 2 PARALLELIZATION OPTIONS
  if (defined $parallel){
    # trailing newline added so that Perl does not append "at <script> line <n>." to this user-facing message
    die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!\n" unless ($bowtie2);
  }
  if ($bowtie2 and $parallel){
    die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
    push @bowtie_options,"-p $parallel";
    push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
    print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
    sleep (2);
  }

  ### REPORTING OPTIONS

  if ($bowtie2){
    push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default

    ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7, so it is accepted but no longer passed on to Bowtie 2
    if (defined $most_valid_alignments){
      warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
    }
  }
  else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
    push @bowtie_options,'-k 2';
  }

  ### --BEST
  if ($bowtie2){
    if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
      die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
    }
  }
  elsif (!$best){
    # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
    push @bowtie_options,'--best';
  }

  ### VANILLA BISMARK (BOWTIE 1) OUTPUT
  if ($vanilla){
    if ($bowtie2){
      die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
    }
  }
  else{
    $vanilla = 0;
  }

  ### PAIRED-END MAPPING
  if ($mates1){
    my @mates1 = (split (/,/,$mates1));
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
    my @mates2 = (split(/,/,$mates2));
    unless (scalar @mates1 == scalar @mates2){
      die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
    }
    # both lists have the same length here; every pair is stored as a single "mate1,mate2" entry
    foreach my $index (0..$#mates1){
      last unless ($mates1[$index] and $mates2[$index]);
      push @filenames,"$mates1[$index],$mates2[$index]";
    }
    if ($bowtie2){
      push @bowtie_options,'--no-mixed';     ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
      push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
    }

    if ($old_flag){
      warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla);
      sleep(3);
    }
  }
  elsif ($mates2){
    die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
  }

  ### SINGLE-END MAPPING
  # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
  my $singles;
  unless ($mates1 and $mates2){
    $singles = join (',',@ARGV);
    unless ($singles){
      die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
    }
    $singles =~ s/\s/,/g;
    @filenames = (split(/,/,$singles));
    warn "\nFiles to be analysed:\n";
    warn "@filenames\n\n";
    sleep (3);
  }

  ### MININUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $minins){
    die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--minins $minins";
  }

  ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
  if (defined $maxins){
    die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
    push @bowtie_options,"--maxins $maxins";
  }
  elsif (!$singles){
    push @bowtie_options,'--maxins 500';
  }

  ### QUIET prints nothing besides alignments (suppresses warnings)
  if ($quiet){
    push @bowtie_options,'--quiet';
  }

  ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
  unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
    if (defined $chunk){
      push @bowtie_options,"--chunkmbs $chunk";
    }
    else{
      push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
    }
  }


  ### SUMMARY OF ALL BOWTIE OPTIONS
  my $bowtie_options = join (' ',@bowtie_options);


  ### STRAND-SPECIFIC LIBRARIES
  my $directional;
  if ($non_directional){
    die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat);
    warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n";
    sleep (3);
    $directional = 0;
  }
  elsif($pbat){
    die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip);
    die "The option --pbat is currently not working for Bowtie 2. Please run alignments in default (i.e. Bowtie 1) mode!\n" if ($bowtie2);
    die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta);

    warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n";
    sleep (3);
    $directional = 0;
  }
  else{
    warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n";
    sleep (3);
    $directional = 1; # default behaviour
  }

  ### UNMAPPED SEQUENCE OUTPUT
  $unmapped = 0 unless ($unmapped);

  ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
  $multi_map = 0 unless ($multi_map);


  ### OUTPUT DIRECTORY
  ### relative paths are resolved against the directory Bismark was started from
  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($output_dir){
    $output_dir = $enter_or_create_dir->($output_dir,'output');
    warn "Output will be written into the directory: $output_dir\n";
  }
  else{
    $output_dir = '';
  }

  ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
  chdir $parent_dir or die "Failed to move back to current working directory\n";
  if ($temp_dir){
    warn "\nUsing temp directory: $temp_dir\n";
    $temp_dir = $enter_or_create_dir->($temp_dir,'temporary');
    warn "Temporary files will be written into the directory: $temp_dir\n";
  }
  else{
    $temp_dir = '';
  }

  ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE
  if ($non_bs_mm and $vanilla){
    die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n";
  }

  ### PREFIX FOR OUTPUT FILES
  if ($prefix){
    $prefix =~ s/\.+$//; # removing trailing dots
    warn "Using the following prefix for output files: $prefix\n\n";
    sleep(1);
  }


  return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag);
}
6782 | |
6783 | |
6784 | |
### Writes the SAM header to the (already opened) OUT filehandle: one @HD line, one @SQ line per entry
### of the file-level hash %chromosomes and one @PG line holding the file-level $bismark_version and
### $command_line. The LN value of an @SQ line is the string length of the value stored for that key.
sub generate_SAM_header{
  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";          # @HD = header, VN = version, SO = sort order

  # Perl does not return hash keys in a stable order, so the names are sorted here to make sure that
  # repeated runs against the same genome produce identical headers (and thus identical reference order)
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr});
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";        # @SQ = sequence, SN = seq name, LN = length
  }

  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
6793 | |
6794 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: | |
6795 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011) | |
6796 | |
6797 sub single_end_SAM_output{ | |
6798 my ($id,$actual_seq,$methylation_call_params,$qual) = @_; | |
6799 my $strand = $methylation_call_params->{$id}->{alignment_strand}; | |
6800 my $chr = $methylation_call_params->{$id}->{chromosome}; | |
6801 my $start = $methylation_call_params->{$id}->{position}; | |
6802 my $stop = $methylation_call_params->{$id}->{end_position}; | |
6803 my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence}; | |
6804 my $methcall = $methylation_call_params->{$id}->{methylation_call}; | |
6805 my $read_conversion = $methylation_call_params->{$id}->{read_conversion}; | |
6806 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; | |
6807 my $number_of_mismatches; | |
6808 if ($bowtie2){ | |
6809 $number_of_mismatches= $methylation_call_params->{$id}->{alignment_score}; | |
6810 } | |
6811 else{ | |
6812 $number_of_mismatches= $methylation_call_params->{$id}->{number_of_mismatches}; | |
6813 } | |
6814 | |
6815 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" | |
6816 ## FLAG: bitwise FLAG. Each bit is explained in the following table: | |
6817 ## Bit Description Comment Value | |
6818 ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1) | |
6819 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2) | |
6820 ## 0x4 segment unmapped --- --- | |
6821 ## 0x8 next segment in the template unmapped --- --- | |
6822 ## 0x10 SEQ being reverse complemented value: 2**4 ( 16) | |
6823 ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32) | |
6824 ## 0x40 the first segment in the template read 1 value: 2**6 ( 64) | |
6825 ## 0x80 the last segment in the template read 2 value: 2**7 (128) | |
6826 ## 0x100 secondary alignment --- --- | |
6827 ## 0x200 not passing quality controls --- --- | |
6828 ## 0x400 PCR or optical duplicate --- --- | |
6829 | |
6830 ##### | |
6831 | |
6832 my $flag; # FLAG variable used for SAM format. | |
6833 if ($strand eq "+"){ | |
6834 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ | |
6835 $flag = 0; # 0 for "+" strand (OT) | |
6836 } | |
6837 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ | |
6838 $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand) | |
6839 } | |
6840 else{ | |
6841 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; | |
6842 } | |
6843 } | |
6844 elsif ($strand eq "-"){ | |
6845 if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ | |
6846 $flag = 16; # 16 for "-" strand (OB) | |
6847 } | |
6848 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ | |
6849 $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand) | |
6850 } | |
6851 else{ | |
6852 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n"; | |
6853 } | |
6854 } | |
6855 else{ | |
6856 die "Unexpected strand information: $strand\n\n"; | |
6857 } | |
6858 | |
6859 ##### | |
6860 | |
6861 my $mapq = 255; # Assume mapping quality is unavailable | |
6862 | |
6863 ##### | |
6864 | |
6865 my $cigar; | |
6866 if ($bowtie2){ | |
6867 $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2 | |
6868 } | |
6869 else{ | |
6870 $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches) | |
6871 } | |
6872 | |
6873 ##### | |
6874 | |
6875 my $rnext = "*"; # Paired-end variable | |
6876 | |
6877 ##### | |
6878 | |
6879 my $pnext = 0; # Paired-end variable | |
6880 | |
6881 ##### | |
6882 | |
6883 my $tlen = 0; # Paired-end variable | |
6884 | |
6885 ##### | |
6886 | |
6887 if ($read_conversion eq 'CT'){ | |
6888 $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands | |
6889 } | |
6890 else{ | |
6891 $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries | |
6892 } | |
6893 | |
6894 if ($strand eq '-'){ | |
6895 $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand | |
6896 $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence | |
6897 $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well | |
6898 } | |
6899 | |
6900 ##### | |
6901 | |
6902 my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string | |
6903 # into the reference string. hemming_dist() | |
6904 if ($bowtie2){ | |
6905 $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence | |
6906 } | |
6907 | |
6908 my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences | |
6909 | |
6910 ##### | |
6911 | |
6912 my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!) | |
6913 | |
6914 ##### | |
6915 | |
6916 my $XM_tag; # Optional tag XM: Methylation Call String | |
6917 if ($strand eq '+'){ | |
6918 $XM_tag = "XM:Z:$methcall"; | |
6919 } | |
6920 elsif ($strand eq '-'){ | |
6921 $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well | |
6922 } | |
6923 | |
6924 ##### | |
6925 | |
6926 my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion | |
6927 | |
6928 ##### | |
6929 | |
6930 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion | |
6931 | |
6932 ##### | |
6933 | |
6934 # Optionally calculating number of mismatches for Bowtie 2 alignments | |
6935 | |
6936 if ($non_bs_mm) { | |
6937 if ($bowtie2) { | |
6938 | |
6939 $number_of_mismatches =~ s/-//; # removing the minus sign | |
6940 | |
6941 ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches | |
6942 if ($cigar =~ /(D|I)/) { | |
6943 # warn "$cigar\n"; | |
6944 | |
6945 # parsing CIGAR string | |
6946 my @len = split (/\D+/,$cigar); # storing the length per operation | |
6947 my @ops = split (/\d+/,$cigar); # storing the operation | |
6948 shift @ops; # remove the empty first element | |
6949 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
6950 | |
6951 foreach (0..$#len) { | |
6952 if ($ops[$_] eq 'M') { | |
6953 # warn "skipping\n"; | |
6954 next; # irrelevant | |
6955 } | |
6956 elsif ($ops[$_] eq 'I') { # insertion in the read sequence | |
6957 $number_of_mismatches -= $insertion_open; | |
6958 $number_of_mismatches -= $len[$_] * $insertion_extend; | |
6959 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; | |
6960 } | |
6961 elsif ($ops[$_] eq 'D') { # deletion in the read sequence | |
6962 $number_of_mismatches -= $deletion_open; | |
6963 $number_of_mismatches -= $len[$_] * $deletion_extend; | |
6964 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; | |
6965 } | |
6966 elsif ($cigar =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die | |
6967 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; | |
6968 } | |
6969 else { | |
6970 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; | |
6971 } | |
6972 } | |
6973 # warn "Alignment score $number_of_mismatches\n"; | |
6974 # print "Mismatches $number_of_mismatches\n\n"; | |
6975 } | |
6976 ### Now we have InDel corrected alignment scores | |
6977 | |
6978 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the | |
6979 ### sequence contained more than 5 Ns, but this should occur close to never | |
6980 | |
6981 my $seq_N_count = $number_of_mismatches % 6; # modulo 6 will return the integer rest after the division | |
6982 # warn "N count: $seq_N_count\n"; | |
6983 $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count; | |
6984 # warn "MM $number_of_mismatches\n"; | |
6985 } | |
6986 } | |
6987 | |
6988 #### | |
6989 | |
6990 my $XA_tag = "XA:Z:$number_of_mismatches"; | |
6991 | |
6992 ##### | |
6993 | |
6994 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields | |
6995 ### optionally print number of non-bisulfite mismatches | |
6996 if ($non_bs_mm){ | |
6997 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag,$XA_tag)),"\n"; | |
6998 } | |
6999 else{ # default | |
7000 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields | |
7001 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n"; | |
7002 } | |
7003 } | |
7004 | |
### Converts a Bowtie 2 alignment score into an approximate number of non-bisulfite mismatches.
###
### Private helper of paired_end_SAM_output(); it is applied to each read of a pair in turn.
###
### Parameters:
###   $score - alignment score as stored for the read (a leading minus sign is removed)
###   $cigar - CIGAR string of the same read
### Returns:
###   int(score / 6) + (score % 6) after the gap penalties of all insertions and deletions
###   found in the CIGAR string have been subtracted from the score.
### Dies if the CIGAR string has a different number of lengths and operations, or if it contains
### operations other than 'M', 'I' and 'D'.
### Reads the file-level settings $insertion_open, $insertion_extend, $deletion_open and $deletion_extend.
sub _bowtie2_score_to_mismatches{
  my ($score,$cigar) = @_;

  $score =~ s/-//; # removing the minus sign

  ### if the read contained any indels their penalties need to be taken out of the score first
  if ($cigar =~ /[DI]/){

    # parsing CIGAR string
    my @len = split (/\D+/,$cigar); # storing the length per operation
    my @ops = split (/\d+/,$cigar); # storing the operation
    shift @ops;                     # remove the empty first element
    die "CIGAR string '$cigar' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

    foreach my $i (0..$#len){
      my $op = $ops[$i];

      if ($op eq 'M'){
        next; # matches/mismatches carry no gap penalty
      }
      elsif ($op eq 'I'){ # insertion in the read sequence
        $score -= $insertion_open;
        $score -= $len[$i] * $insertion_extend;
      }
      elsif ($op eq 'D'){ # deletion in the read sequence
        $score -= $deletion_open;
        $score -= $len[$i] * $deletion_extend;
      }
      elsif ($cigar =~ tr/NSHPX=//){ # if these (for standard mapping) illegal characters exist we die
        die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
      else{
        die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
      }
    }
  }

  ### Now we have an InDel corrected alignment score.
  ### Ns receive a penalty of -1, but normal mismatches receive -6, so the remainder of the division by 6 is
  ### taken as the number of Ns. This might still break if the sequence contained more than 5 Ns, but this
  ### should occur close to never.
  my $N_count = $score % 6;
  return int ($score / 6) + $N_count;
}

### Writes the two SAM records (read 1, then read 2) of a paired-end alignment to the OUT filehandle.
###
### Parameters:
###   $id                      - read ID; also the key into $methylation_call_params
###   $actual_seq_1/2          - sequences of read 1 and read 2
###   $methylation_call_params - hash reference; $methylation_call_params->{$id} holds the alignment details
###                              (strands, chromosome, genomic sequences, methylation calls, conversions,
###                              positions and, for Bowtie 2, CIGAR strings, alignment scores and indel counts)
###   $qual_1/2                - quality strings of read 1 and read 2
###
### Uses the file-level settings $bowtie2, $old_flag and $non_bs_mm.
### Dies if a genomic sequence contains characters other than IUPAC nucleotide codes, or if the combination
### of read 1 conversion and genome conversion is not one of the four known bisulfite strands.
sub paired_end_SAM_output{
  my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;

  my $params = $methylation_call_params->{$id};

  my $strand_1          = $params->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
  my $strand_2          = $params->{alignment_read_2};
  my $chr               = $params->{chromosome};
  my $ref_seq_1         = $params->{unmodified_genomic_sequence_1};
  my $ref_seq_2         = $params->{unmodified_genomic_sequence_2};
  my $methcall_1        = $params->{methylation_call_1};
  my $methcall_2        = $params->{methylation_call_2};
  my $read_conversion_1 = $params->{read_conversion_1};
  my $read_conversion_2 = $params->{read_conversion_2};
  my $genome_conversion = $params->{genome_conversion};

  # appending /1 or /2 confuses some downstream programs such as Picard, so it is only done on request
  my ($id_1,$id_2) = $old_flag ? ($id.'/1', $id.'/2') : ($id, $id);

  # Allows all degenerate nucleotide sequences in reference genome
  die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
  die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;

  #####

  # $index stores the strand origin of the alignment in a less convoluted way
  my $index;

  if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
    $index = 0; ## this is OT   (original top strand)
  }
  elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
    $index = 1; ## this is CTOB (complementary to OB)
  }
  elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
    $index = 2; ## this is CTOT (complementary to OT)
  }
  elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
    $index = 3; ## this is OB   (original bottom)
  }
  else{
    die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
  }

  #####

  # mismatch information; only needed for custom allele-specific output, not the default!
  my $number_of_mismatches_1;
  my $number_of_mismatches_2;

  if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine
    $number_of_mismatches_1 = $params->{alignment_score_1};
    $number_of_mismatches_2 = $params->{alignment_score_2};
  }
  elsif ($index == 2 or $index == 3){ # CTOT or OB
    # Bowtie 1 reports always the leftmost read first. That means we have to swap the values
    # if the first read aligned in reverse orientation
    $number_of_mismatches_1 = $params->{number_of_mismatches_2};
    $number_of_mismatches_2 = $params->{number_of_mismatches_1};
  }
  else{ # if the first read aligned in forward direction it is like for Bowtie 2
    $number_of_mismatches_1 = $params->{number_of_mismatches_1};
    $number_of_mismatches_2 = $params->{number_of_mismatches_2};
  }

  #####

  ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make
  ### a methylation call at the first or last position.

  if ($index == 0 or $index == 3){ # OT or OB
    $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
    $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
  }
  else{ # CTOT or CTOB
    $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
    $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
  }

  #####

  # start and end positions of both reads
  my ($start_read_1,$start_read_2,$end_read_1,$end_read_2);

  if ($bowtie2){
    $start_read_1 = $params->{position_1};
    $start_read_2 = $params->{position_2};
    $end_read_1   = $params->{end_position_1};
    $end_read_2   = $params->{end_position_2};
  }
  elsif ($strand_1 eq '+'){ # Bowtie 1 output, Read 1 aligns to the + strand
    $start_read_1 = $params->{start_seq_1};
    $end_read_1   = $params->{start_seq_1} + length ($actual_seq_1) - 1;
    $start_read_2 = $params->{alignment_end} - length ($actual_seq_2) + 1;
    $end_read_2   = $params->{alignment_end};
  }
  else{ # Bowtie 1 output, Read 1 is on the - strand
    $start_read_1 = $params->{alignment_end} - length ($actual_seq_1) + 1;
    $end_read_1   = $params->{alignment_end};
    $start_read_2 = $params->{start_seq_1};
    $end_read_2   = $params->{start_seq_1} + length ($actual_seq_2) - 1;
  }

  #####

  ### FLAG: bitwise FLAG field of the SAM format. Bits used here:
  ##   0x1  (  1)  template having multiple segments in sequencing (paired end)
  ##   0x2  (  2)  each segment properly aligned according to the aligner
  ##   0x10 ( 16)  SEQ being reverse complemented
  ##   0x20 ( 32)  SEQ of the next segment in the template being reversed
  ##   0x40 ( 64)  the first segment in the template (read 1)
  ##   0x80 (128)  the last segment in the template (read 2)

  ### As the FLAG values do not consider that there might be 4 different bisulfite strands of DNA, the FLAG
  ### tags take the strand identity into account:
  # strands OT and CTOT are treated as aligning to the top strand
  # strands OB and CTOB are treated as aligning to the bottom strand

  my $flag_1; # FLAG variable used for SAM format
  my $flag_2;

  if ($index == 0 or $index == 2){ # OT or CTOT
    if ($old_flag){
      $flag_1 = 67;  # read 1, properly paired (1+2+64); strand bits are not set
      $flag_2 = 131; # read 2, properly paired (1+2+128); strand bits are not set
    }
    else{ # default FLAG values, suggested by Peter Hickey, Australia
      $flag_1 = 99;  # read 1 forward, mate reversed (1+2+32+64)
      $flag_2 = 147; # read 2 reversed (1+2+16+128)
    }
  }
  else{ # CTOB or OB ($index is 1 or 3, anything else died above)
    if ($old_flag){
      $flag_1 = 115; # read 1 reversed, mate reversed (1+2+16+32+64)
      $flag_2 = 179; # read 2 reversed, mate reversed (1+2+16+32+128)
    }
    else{ # default FLAG values, suggested by Peter Hickey, Australia
      $flag_1 = 83;  # read 1 reversed (1+2+16+64)
      $flag_2 = 163; # read 2 forward, mate reversed (1+2+32+128)
    }
  }

  #####

  my $mapq = 255; # Mapping quality is unavailable

  #####

  my $cigar_1;
  my $cigar_2;

  if ($bowtie2){
    $cigar_1 = $params->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
    $cigar_2 = $params->{CIGAR_2};
  }
  else{
    $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
    $cigar_2 = length($actual_seq_2) . "M";
  }

  #####

  my $rnext = '='; # Chromosome of mate; applies to both reads

  #####

  my $pnext_1 = $start_read_2; # Leftmost position of mate
  my $pnext_2 = $start_read_1;

  #####

  # TLEN: signed observed Template LENgth (or inferred fragment size).
  # The leftmost read receives the + sign, the other read the - sign.
  my $read_1_is_leftmost;
  my $template_length;

  if ($bowtie2){
    # The read with the smaller start position is the leftmost one (read 1 if both start at the same
    # position). The template ends at whichever read ends last; if one read is fully contained within
    # the other this is simply the length of the containing read (changed as a request by frozenlyse
    # on SeqAnswers on 24 July 2013). Dovetailing of reads is not enabled for Bowtie 2 alignments.
    $read_1_is_leftmost = ($start_read_1 <= $start_read_2) ? 1 : 0;

    my $template_start = $read_1_is_leftmost ? $start_read_1 : $start_read_2;
    my $template_end   = ($end_read_1 > $end_read_2) ? $end_read_1 : $end_read_2;

    $template_length = $template_end - $template_start + 1;
  }
  else{ # Bowtie 1
    # -------------------------> leftmost read
    #                   <------------------------- rightmost read
    # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
    if ($end_read_2 >= $end_read_1){ # Read 1 alignment is leftmost
      $read_1_is_leftmost = 1;
      $template_length = $end_read_2 - $start_read_1 + 1;
    }
    else{ # Read 2 alignment is leftmost
      $read_1_is_leftmost = 0;
      $template_length = $end_read_1 - $start_read_2 + 1;
    }
  }

  my $tlen_1 = $read_1_is_leftmost ?  $template_length : -$template_length;
  my $tlen_2 = $read_1_is_leftmost ? -$template_length :  $template_length;

  #####

  # adjusting the strand of the sequences before we use them to generate mismatch strings
  if ($strand_1 eq '-'){
    $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
    $ref_seq_1    = revcomp($ref_seq_1);    # Required for comparison with actual sequence
    $qual_1       = reverse $qual_1;        # we need to reverse the quality string as well
  }
  if ($strand_2 eq '-'){
    $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
    $ref_seq_2    = revcomp($ref_seq_2);    # Required for comparison with actual sequence
    $qual_2       = reverse $qual_2;        # If the sequence gets reverse complemented we reverse the quality string as well
  }

  #####

  # Optional tag NM: edit distance based on nucleotide differences
  my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1);
  my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
  if ($bowtie2){
    # Adding the number of inserted/deleted bases which were parsed while getting the genomic sequence
    $hemming_dist_1 += $params->{indels_1};
    $hemming_dist_2 += $params->{indels_2};
  }
  my $NM_tag_1 = "NM:i:$hemming_dist_1";
  my $NM_tag_2 = "NM:i:$hemming_dist_2";

  #####

  # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
  my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1);
  my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);

  #####

  # Optional tag XM: Methylation call string. Needs to be reversed if the sequence was reverse complemented
  my $XM_tag_1 = 'XM:Z:' . (($strand_1 eq '-') ? scalar reverse($methcall_1) : $methcall_1);
  my $XM_tag_2 = 'XM:Z:' . (($strand_2 eq '-') ? scalar reverse($methcall_2) : $methcall_2);

  #####

  my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
  my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state

  #####

  my $XG_tag = "XG:Z:$genome_conversion";   # Optional tag XG: Genome Conversion state; valid for both reads

  #####

  # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  my @sam_fields_1 = ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1, $XR_tag_1, $XG_tag);
  my @sam_fields_2 = ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2, $XR_tag_2, $XG_tag);

  ### optionally print number of non-bisulfite mismatches (tags XA for read 1 and XB for read 2)
  if ($non_bs_mm){
    if ($bowtie2){
      # Bowtie 2 only provides alignment scores, so the number of mismatches has to be derived from them
      $number_of_mismatches_1 = _bowtie2_score_to_mismatches($number_of_mismatches_1,$cigar_1);
      $number_of_mismatches_2 = _bowtie2_score_to_mismatches($number_of_mismatches_2,$cigar_2);
    }
    push @sam_fields_1, "XA:Z:$number_of_mismatches_1";
    push @sam_fields_2, "XB:Z:$number_of_mismatches_2";
  }

  print OUT join("\t", @sam_fields_1), "\n";
  print OUT join("\t", @sam_fields_2), "\n";
}
7509 | |
### Returns the reverse complement of a nucleotide sequence.
###
### Parameters:
###   $seq - nucleotide sequence; the sub dies if it is missing or evaluates to false (e.g. an empty string)
### Returns:
###   The sequence in reverse order with every base replaced by its complement. A, C, G, T and the
###   IUPAC degenerate codes R/Y, M/K, B/V and D/H are complemented (lower case input of these is
###   returned as upper case); all other characters, including the self-complementary N, S and W,
###   are reversed but otherwise left untouched.
sub revcomp{
  my $seq = shift or die "Missing seq to reverse complement\n";
  $seq = reverse $seq;
  # Reference sequences may legitimately contain degenerate IUPAC codes, so these need to be
  # complemented as well: R (A/G) <-> Y (C/T), M (A/C) <-> K (G/T), B (not A) <-> V (not T), D (not C) <-> H (not G)
  $seq =~ tr/ACGTRYMKBDHVacgtrymkbdhv/TGCAYRKMVHDBTGCAYRKMVHDB/;
  return $seq;
}
7516 | |
### Counts the positions at which a read differs from its reference sequence.
###
### Parameters (positional):
###   1. actual (read) sequence
###   2. reference sequence
### Returns:
###   The number of positions of the read which do not carry the same character in the reference.
###   Only the positions covered by the read are compared: read positions beyond the end of a shorter
###   reference count as differences, surplus reference positions are ignored.
sub hemming_dist{
  my ($actual_seq,$ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  # Restricting the reference to the length of the read so that surplus reference bases are not counted
  my $ref_window = substr($ref_seq, 0, length($actual_seq));

  # Bitwise string comparison: identical characters yield \0. A shorter reference is padded with \0 by the
  # XOR, so uncovered read positions show up as differences without touching undefined values
  my $difference = ($actual_seq ^ $ref_window);

  return ($difference =~ tr/\0//c); # number of non-\0 characters, i.e. differing positions
}
7526 | |
### Builds the optional SAM tag XX, a string describing the mismatches between a read and its reference
### (NO indel information!).
###
### Parameters (positional):
###   1. actual (read) sequence; the sub dies if it is missing or evaluates to false
###   2. reference sequence;     the sub dies if it is missing or evaluates to false
### Returns:
###   "XX:Z:" followed by, for every differing position, the number of matching positions since the
###   previous difference (omitted if 0) and the reference base at that position, and finally the number
###   of matching positions after the last difference (omitted if 0). Example: read ACGT vs reference
###   ACTT yields "XX:Z:2T1".
sub make_mismatch_string{
  my $actual_seq = shift or die "Missing actual sequence";
  my $ref_seq = shift or die "Missing reference sequence";
  my $XX_tag = "XX:Z:";
  my $ref_length = length($ref_seq);
  my $tmp = ($actual_seq ^ $ref_seq);                  # Bitwise comparison, identical characters yield \0
  my $prev_mm_pos = 0;
  while($tmp =~ /[^\0]/g){                             # Where bitwise comparison showed a difference
    my $mm_pos = pos($tmp);                            # 1-based position of the current difference
    my $nuc_match = $mm_pos - $prev_mm_pos - 1;        # Number of nucleotides that matched since the last mismatch
    $XX_tag .= "$nuc_match" if $nuc_match > 0;         # Ignore if mismatches are adjacent to each other
    # Reference nucleotide that was different from the actual read. Differences beyond the end of the
    # reference (read longer than reference) have no reference base and are not reported
    $XX_tag .= substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;
    $prev_mm_pos = $mm_pos;                            # Position of last mismatch
  }
  my $end_matches = $ref_length - $prev_mm_pos;        # Number of matches from last mismatch till end of sequence
  $XX_tag .= "$end_matches" if $end_matches > 0;       # Ignore if mismatch is at the end of sequence
  return $XX_tag;
}
7544 | |
7545 | |
7546 | |
# Prints the Bismark help text (licence notice, description, usage, all command
# line options and the output format descriptions) to the currently selected
# output filehandle (STDOUT unless changed with select()).
# Takes no arguments. The heredoc is non-interpolating so that characters such
# as '@' or '$' in the help text are always printed literally.
sub print_helpfile{
  print <<'HOW_TO';


     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.

     This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     GNU General Public License for more details.
     You should have received a copy of the GNU General Public License
     along with this program. If not, see <http://www.gnu.org/licenses/>.



DESCRIPTION


The following is a brief description of command line options and arguments to control the Bismark
bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
sequence from the genome and determine if there were any protected C's present or not.

As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
re-enabled by using --non_directional.

The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.


USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}


ARGUMENTS:

<genome_folder> The path to the folder containing the unmodified reference genome
as well as the subfolders created by the Bismark_Genome_Preparation
script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
Bismark expects one or more fastA files in this folder (file extension: .fa
or .fasta). The path can be relative or absolute.

-1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
"_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates2>.
Reads may be a mix of different lengths. Bismark will produce one mapping result
and one report file per paired-end input file pair.

-2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
"_2"), e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates1>.
Reads may be a mix of different lengths.

<singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
produce one mapping result and one report file per input file.


OPTIONS:


Input:

-q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
files (usually having extension .fq or .fastq). This is the default. See also
--solexa-quals.

-f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
files (usually having extension .fa, .mfa, .fna or similar). All quality values
are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
the read name and the sequence on a single line (and not spread over several lines).

-s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.

-u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.

--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.

--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.

--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
(which can't). The formula for conversion is:
phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
is usually the right option for use with (unconverted) reads emitted by the GA
Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.

--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
reads emitted by GA Pipeline version 1.3 or later. Default: off.

--path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
specified it is assumed that Bowtie (1 or 2) is in the PATH.


Alignment:

-n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).

-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).

-e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
quality values to the nearest 10 and saturates at 30. This value is not relevant for
Bowtie 2.

--chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
--best mode. Best-first search must keep track of many paths at once to ensure it is
always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
memory impact of the descriptors, but they can still grow very large in some cases. If
you receive an error message saying that chunk memory has been exhausted in --best mode,
try adjusting this parameter up to dedicate more memory to the descriptors. This value
is not relevant for Bowtie 2. Default: 512.

-I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
satisfied). A 19-bp gap would not be valid in that case. Default: 0.

-X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
A 61-bp gap would not be valid in that case. Default: 500.


Bowtie 1 Reporting:

-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
will be used by default.

--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
(i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
have Phred quality 10. When --best is not specified, Bowtie may report alignments that
are sub-optimal in terms of stratum and/or quality (though an effort is made to report
the best alignment). --best mode also removes all strand bias. Note that --best does not
affect which alignments are considered "valid" by Bowtie, only which valid alignments
are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
Default: on.

--no_best Disables the --best option which is on by default. This can speed up the alignment process,
e.g. for testing purposes, but for credible results it is not recommended to disable --best.


Output:

--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
bisulfite strands will be reported. Default: OFF.

(The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
to the original strands are merely theoretical and should not exist in reality. Specifying directional
alignments (which is the default) will only run 2 alignment threads to the original top (OT)
or bottom (OB) strands in parallel and report these alignments. This is the recommended option
for strand-specific libraries).

--pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
and OB ones. Use this option only if you are certain that your libraries were constructed following
a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
--pbat works only for single-end and paired-end FastQ files for use with Bowtie1 (uncompressed
temporary files only).

--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
split up into several smaller files to run concurrently and the output files are to be merged.

--quiet Print nothing besides alignments.

--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
of SAM format output.

-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
appear as they did in the input, without any translation of quality values that may have
taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.

--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
mismatches or other reads that fail to align uniquely to a file in the output directory.
Written reads will appear as they did in the input, without any of the translation of quality
values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
_ambiguous_reads_2.txt. These reads are not written to the file specified with --un.

-o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
to create it first. The path to the output folder can be either relative or absolute.

--temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
the specified folder does not exist, Bismark will attempt to create it first. The path to the
temporary folder can be either relative or absolute.

--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the
alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
and 'XB:Z:number of mismatches' for read 2 of paired-end reads.

--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
space. This option is available for most alignment modes but is not available for paired-end FastA
files. This option might be somewhat slower than writing out uncompressed files, but this awaits
further testing.

--bam The output will be written out in BAM format instead of the default SAM format. Bismark will
attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).

--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
explicitly if Samtools is in the PATH already.

--prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.

--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
both the appended read IDs and custom FLAG values may cause problems with some downstream tools
such as Picard, new defaults were implemented as of version 0.8.3.


default old_flag
=================== ===================
Read 1 Read 2 Read 1 Read 2

OT: 99 147 67 131

OB: 83 163 115 179

CTOT: 99 147 67 131

CTOB: 83 163 115 179



Other:

-h/--help Displays this help file.

-v/--version Displays version information.


BOWTIE 2 SPECIFIC OPTIONS

--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
alignments, i.e. searches for alignments involving all read characters (also called
untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
and/or quality trimmed where appropriate. Default: off.

Bowtie 2 alignment options:

-N <int> Sets the number of mismatches to be allowed in a seed alignment during multiseed alignment.
Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
Bowtie 1 see -n).

-L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
Bowtie 1 see -l).

--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
position to be the highest possible, regardless of the actual value. I.e. input is treated
as though all quality values are high. This is also the default behavior when the input
doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.


Bowtie 2 paired-end options:

--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
it cannot find a concordant or discordant alignment for a pair. This option is invariable and
on by default.

--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
A discordant alignment is an alignment where both mates align uniquely, but that does not
satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
and it is on by default.


Bowtie 2 effort options:

-D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
the alignments found so far. A seed extension "fails" if it does not yield a new best or a
new second-best alignment. Default: 15.

-R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
mismatches allowed) at different offsets and searches for more alignments. A read is considered
to have repetitive seeds if the total number of seed hits divided by the number of seeds
that aligned at least once is greater than 300. Default: 2.

Bowtie 2 parallelization options:


-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
automatically use the option '--reorder', which guarantees that output SAM records are printed in
an order corresponding to the order of the reads in the original input file, even when -p is set
greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
correspond to input order in that case.

Bowtie 2 Scoring options:

--score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
"valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
L,0,-0.2.

--rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
of <int1> + N * <int2>. Default: 5, 3.

--rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
a penalty of <int1> + N * <int2>. Default: 5, 3.


Bowtie 2 Reporting options:

-most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
effort expended to find valid alignments.

For reference, this used to be the old (now deprecated) description of -M:
Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
happens first. Only the best alignment is reported. Information from the other alignments is used to
estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
guarantee that the alignment reported is the best possible in terms of alignment score. -M is
always used and its default value is set to 10.


'VANILLA' Bismark OUTPUT:

Single-end output format (tab-separated):

(1) <seq-ID>
(2) <read alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence>
(7) <equivalent genomic sequence>
(8) <methylation call>
(9) <read conversion>
(10) <genome conversion>
(11) <read quality score (Phred33)>


Paired-end output format (tab-separated):
(1) <seq-ID>
(2) <read 1 alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence 1>
(7) <equivalent genomic sequence 1>
(8) <methylation call 1>
(9) <observed bisulfite sequence 2>
(10) <equivalent genomic sequence 2>
(11) <methylation call 2>
(12) <read 1 conversion>
(13) <genome conversion>
(14) <read 1 quality score (Phred33)>
(15) <read 2 quality score (Phred33)>


Bismark SAM OUTPUT (default):

(1) QNAME (seq-ID)
(2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
(3) RNAME (chromosome)
(4) POS (start position)
(5) MAPQ (always 255)
(6) CIGAR
(7) RNEXT
(8) PNEXT
(9) TLEN
(10) SEQ
(11) QUAL (Phred33 scale)
(12) NM-tag (edit distance to the reference)
(13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
(14) XM-tag (methylation call string)
(15) XR-tag (read conversion state for the alignment)
(16) XG-tag (genome conversion state for the alignment)
(17) XA/XB-tag (non-bisulfite mismatches) (optional!)

Each read of paired-end alignments is written out in a separate line in the above format.


Last edited on 07 October 2013.

HOW_TO
}