Mercurial > repos > bgruening > bismark
comparison new/bismark @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/perl -- | |
2 use strict; | |
3 use warnings; | |
4 use IO::Handle; | |
5 use Cwd; | |
6 $|++; | |
7 use Getopt::Long; | |
8 | |
9 | |
10 ## This program is Copyright (C) 2010-15, Felix Krueger (felix.krueger@babraham.ac.uk) | |
11 | |
12 ## This program is free software: you can redistribute it and/or modify | |
13 ## it under the terms of the GNU General Public License as published by | |
14 ## the Free Software Foundation, either version 3 of the License, or | |
15 ## (at your option) any later version. | |
16 | |
17 ## This program is distributed in the hope that it will be useful, | |
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 ## GNU General Public License for more details. | |
21 | |
22 ## You should have received a copy of the GNU General Public License | |
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
24 | |
25 | |
### Run-wide settings. $command_line is captured first so that it records the
### arguments exactly as typed, before the option rewrite below touches @ARGV.
my $parent_dir      = getcwd;
my $bismark_version = 'v0.14.3';
my $command_line    = join (" ",@ARGV);


### Getopt::Long cannot cope with the '.' in the option name --solexa1.3-quals,
### so that option is rewritten to --phred64-quals before the command line is parsed
for (@ARGV){
  $_ = '--phred64-quals' if ($_ eq '--solexa1.3-quals');
}
my @filenames; # will be populated by processing the command line

my (
  $genome_folder,
  $CT_index_basename,
  $GA_index_basename,
  $path_to_bowtie,
  $sequence_file_format,
  $bowtie_options,
  $directional,
  $unmapped,
  $ambiguous,
  $phred64,
  $solexa,
  $output_dir,
  $bowtie2,
  $vanilla,
  $sam_no_hd,
  $skip,
  $upto,
  $temp_dir,
  $non_bs_mm,
  $insertion_open,
  $insertion_extend,
  $deletion_open,
  $deletion_extend,
  $gzip,
  $bam,
  $samtools_path,
  $pbat,
  $prefix,
  $old_flag,
  $basename,
  $score_min_intercept,
  $score_min_slope,
  $bt2_large_index,
  $multicore,
) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome
my %SQ_order;    # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes
my %counting;    # counting various events
my @pids;        # storing the process IDs of child processes in parallel mode


my $seqID_contains_tabs;
my $verbose = 0;

if ($multicore > 1){
  warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n";
}
54 | |
55 | |
### Splits the current process into $multicore cooperating processes.
###
### Reads the file-level $multicore setting and appends the PID of every spawned
### child to the file-level @pids array (in the parent only).
###
### Returns the list ($process_id,$offset):
###   - single-core mode ($multicore not greater than 1): (1,1), nothing is forked
###   - a child process:   $process_id is 0 and $offset is the number (1-based) it was
###                        forked under
###   - the parent process: $process_id is the PID of the child forked last, and
###                        $offset equals $multicore
### Callers use a true $process_id to recognise the parent, and $offset to select which
### share of the input records a process works on.
###
### Dies if fork() fails.
sub multi_process_handling{

  unless ($multicore > 1){
    warn "Single-core mode: setting pid to 1\n";
    return (1,1);
  }

  my $offset = 1;
  my $process_id;

  # '<' rather than '==' so that a non-integer $multicore can never make this loop fork forever
  while ($offset < $multicore){
    my $fork = fork;

    # the previous message claimed to carry on single-threaded although the program dies here
    die "Forking unsuccessful: $!\n" unless (defined $fork);

    $process_id = $fork;
    last if ($fork == 0); # child: keeps the offset it was forked under and starts working

    # parent: remember the child and move on to the next offset
    push @pids, $fork;
    ++$offset;
  }

  return ($process_id,$offset);
}
110 | |
111 | |
### Writes this process' share of a FastQ file to a temporary file.
###
### Arguments:
###   $filename   - FastQ input; a name ending in 'gz' is read through zcat
###   $process_id - only used in the progress message
###   $offset     - 1-based number of this process; record number N (counting from 1) is
###                 kept when (N - $offset) is a multiple of the file-level $multicore
###
### Records are taken to be exactly four lines long. The output is written to
### ${temp_dir}<input name without directory>.temp.$offset (plus '.gz', compressed with
### gzip, if the file-level $gzip is set).
###
### Returns the temporary file name WITHOUT the $temp_dir part.
### Dies if the input or the output cannot be opened.
sub subset_input_file_FastQ{

  my ($filename,$process_id,$offset) = @_;

  # list-form pipe open and 3-argument open: the file name is never parsed by a shell or as an open mode
  my $in;
  if ($filename =~ /gz$/){
    open ($in,'-|','zcat',$filename) or die "Couldn't read from file '$filename': $!\n";
  }
  else{
    open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
  }

  my $temp = $filename;
  $temp .= ".temp.$offset";
  $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information

  my $out;
  if ($gzip){
    $temp .= '.gz';
    # the output path is handed to the shell as positional parameter $1 instead of being pasted
    # into the command string, so spaces or metacharacters in it are harmless
    open ($out,'|-','sh','-c','exec gzip -c - > "$1"','sh',"${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
  }
  else{
    open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
  }

  my $record_count = 0;

  while (1){
    my $id_line   = <$in>;
    my $seq_line  = <$in>;
    my $plus_line = <$in>;
    my $qual_line = <$in>;

    # testing definedness: a final quality line consisting of just '0' is false but still valid data
    last unless (defined $qual_line);
    ++$record_count;

    if ( ($record_count - $offset)%$multicore == 0){
      print {$out} "$id_line$seq_line$plus_line$qual_line";
    }
  }

  close ($in)  or warn "Failed to close input file handle for '$filename': $!\n";
  close ($out) or warn "Failed to close file handle TEMPFQ: $!\n";

  warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

  return ($temp); # returning the subset filename

}
167 | |
### Writes this process' share of a FastA file to a temporary file.
###
### Arguments:
###   $filename   - FastA input; a name ending in 'gz' is read through zcat
###   $process_id - only used in the progress message
###   $offset     - 1-based number of this process; record number N (counting from 1) is
###                 kept when (N - $offset) is a multiple of the file-level $multicore
###
### Records are read as exactly two lines each (header + sequence), i.e. sequences that
### are wrapped over several lines are not supported by this reader.
### The output is written to ${temp_dir}<input name without directory>.temp.$offset
### (plus '.gz', compressed with gzip, if the file-level $gzip is set).
###
### Returns the temporary file name WITHOUT the $temp_dir part.
### Dies if the input or the output cannot be opened.
sub subset_input_file_FastA{

  my ($filename,$process_id,$offset) = @_;

  # list-form pipe open and 3-argument open: the file name is never parsed by a shell or as an open mode
  my $in;
  if ($filename =~ /gz$/){
    open ($in,'-|','zcat',$filename) or die "Couldn't read from file '$filename': $!\n";
  }
  else{
    open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
  }

  my $temp = $filename;
  $temp .= ".temp.$offset";
  # removing file path information, exactly as subset_input_file_FastQ does. The clean-up code in
  # the main loop also strips the path before deleting ${temp_dir}<name>, so a temp file written
  # with the path still attached would end up in the wrong place and never be removed
  $temp =~ s/^.*\///;

  my $out;
  if ($gzip){
    $temp .= '.gz';
    # the output path is handed to the shell as positional parameter $1 instead of being pasted
    # into the command string, so spaces or metacharacters in it are harmless
    open ($out,'|-','sh','-c','exec gzip -c - > "$1"','sh',"${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
  }
  else{
    open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
  }

  warn "Writing temporary infile to $temp\n";

  my $record_count = 0;

  while (1){
    my $header_line = <$in>;
    my $seq_line    = <$in>;

    last unless (defined $seq_line);
    ++$record_count;

    if ( ($record_count - $offset)%$multicore == 0){
      print {$out} "$header_line$seq_line";
    }
  }

  close ($in)  or warn "Failed to close input file handle for '$filename': $!\n";
  close ($out) or warn "Failed to close file handle TEMPFQ: $!\n";

  warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

  return ($temp); # returning the subset filename

}
222 | |
223 ##### | |
224 ##### | |
225 | |
### Main driver: one iteration per entry of @filenames. An entry containing a comma is
### treated as a 'mate1,mate2' pair (paired-end), anything else as a single-end file.
###
### Per entry:
###   1. counters/filehandles are reset and multi_process_handling() is called (it forks
###      when $multicore > 1)
###   2. in multicore mode each process cuts its own share out of the input file(s)
###   3. the reads are bisulfite-converted, the aligner processes are started and the
###      methylation call procedure is run
###   4. in the parent process of a multicore run: wait for all children, then merge and
###      delete the per-process temporary files and reports
foreach my $filename (@filenames){

  my $original_filename = $filename;
  my $original_filename_1;
  my $original_filename_2;

  chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  @pids = ();
  $seqID_contains_tabs = 0;

  ### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by
  ### splitting the input file(s) into several smaller subfiles and merging the results back at
  ### the end.

  # get general settings (also for single-threaded use)
  my ($pid,$offset) = multi_process_handling ();

  my ($single_end,$paired_end);

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){

    $single_end = 0;
    $paired_end = 1;

    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';
    warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    $original_filename_1 = $filename_1;
    $original_filename_2 = $filename_2;

    warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    ### subsetting the input file(s)
    unless ($multicore == 1){ # not needed in single-core mode
      my $subset_input_file = ($sequence_file_format eq 'FASTA') ? \&subset_input_file_FastA : \&subset_input_file_FastQ; # FastQ is the default

      my $temp_filename_1 = $subset_input_file->($filename_1,$pid,$offset);
      warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n";
      $filename_1 = "${temp_dir}$temp_filename_1";

      my $temp_filename_2 = $subset_input_file->($filename_2,$pid,$offset);
      warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n";
      $filename_2 = "${temp_dir}$temp_filename_2";
    }

    ### converted versions of mate 1 and mate 2. Whichever of these a mode does not produce stays undef
    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    # sets inputfile_1/inputfile_2 for the given entries of @fhs
    my $set_inputs = sub {
      my ($indices,$mate_1,$mate_2) = @_;
      foreach my $index (@$indices){
        $fhs[$index]->{inputfile_1} = $mate_1;
        $fhs[$index]->{inputfile_2} = $mate_2;
      }
    };

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input files are in FastA format\n";

      if ($directional){ # only C->T read 1 and G->A read 2 are needed
        ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }
      else{
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2);
      }

      # in directional mode the G->A read 1 / C->T read 2 files are undef, so fhs 1 and 2 get no input
      $set_inputs->([0,3],$C_to_T_infile_1,$G_to_A_infile_2);
      $set_inputs->([1,2],$G_to_A_infile_1,$C_to_T_infile_2);

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }

    ### FastQ format
    else{
      warn "Input files are in FastQ format\n";

      # Bowtie 1 with gzipped temp files: both mates are converted in one go and inputfile_2 is not used
      my $bowtie1_gzip = (!$bowtie2 && $gzip);

      if ($directional){
        if ($bowtie1_gzip){
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        $set_inputs->([0,3],$C_to_T_infile_1,$G_to_A_infile_2);
        $set_inputs->([1,2],$G_to_A_infile_1,$C_to_T_infile_2); # both undef in directional mode
      }
      elsif($pbat){ # PBAT-Seq. This works for both Bowtie and Bowtie 2
        ### At the moment we are only performing alignments only with uncompressed FastQ files
        ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
        ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);

        # all four files exist here, but only the GAread1/CTread2 processes get any input
        $set_inputs->([0,3],undef,undef);
        $set_inputs->([1,2],$G_to_A_infile_1,$C_to_T_infile_2);
      }
      else{
        if ($bowtie1_gzip){
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time
        }
        else{ # Bowtie 2, or Bowtie 1 with uncompressed temp files
          ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
          ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2);
        }

        $set_inputs->([0,3],$C_to_T_infile_1,$G_to_A_infile_2);
        $set_inputs->([1,2],$G_to_A_infile_1,$C_to_T_infile_2);
      }

      if ($bowtie2){
        paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
      else{
        paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
      }
    }
    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

    $single_end = 1;
    $paired_end = 0;

    ### subsetting the input file(s)
    unless ($multicore == 1){ # not needed in single-core mode
      my $subset_input_file = ($sequence_file_format eq 'FASTA') ? \&subset_input_file_FastA : \&subset_input_file_FastQ; # FastQ is the default

      my $temp_filename = $subset_input_file->($filename,$pid,$offset);
      warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n";
      $filename = "${temp_dir}$temp_filename";
    }

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    ### FastA format
    if ($sequence_file_format eq 'FASTA'){
      warn "Input file is in FastA format\n"; # used to read 'Inut file'
      if ($directional){
        ($C_to_T_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating 4 different bowtie filehandles and storing the first entry
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile);
      }
    }

    ## FastQ format
    else{
      warn "Input file is in FastQ format\n";
      if ($directional){
        ($C_to_T_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      }
      elsif($pbat){
        ($G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
      }
      else{
        ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
        $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
        $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
      }

      ### Creating up to 4 different bowtie filehandles and storing the first entry.
      ### PBAT alignments (supported for Bowtie 2 as of version 0.10.2) never pass a C to T file
      my @converted_files = $pbat ? (undef,$G_to_A_infile) : ($C_to_T_infile,$G_to_A_infile);
      if ($bowtie2){
        single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (@converted_files);
      }
      else{
        single_end_align_fragments_to_bisulfite_genome_fastQ (@converted_files);
      }
    }

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid);

  }

  ### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS

  if ($pid){ # only performing this for the parent process

    if ($multicore > 1){

      warn "Now waiting for all child processes to complete\n";

      ### we need to ensure that we wait for all child processes to be finished before continuing
      foreach my $id (@pids){
        waitpid ($id,0);
        unless ($? == 0){
          warn "\nChild process terminated with exit signal: '$?'\n\n";
        }
      }

      # regenerating names for temporary files
      my @temp_input;
      my @temp_output;
      my @temp_reports;
      my @temp_unmapped_1; # will store single end reads or R1 of paired-end
      my @temp_unmapped_2;
      my @temp_ambiguous_1; # will store single end reads or R1 of paired-end
      my @temp_ambiguous_2;

      # the pieces that vary between runs but not between chunks
      my $gz_suffix  = $gzip ? '.gz' : '';
      my $add_prefix = (defined $prefix) ? "${prefix}." : ''; # if files had a prefix we need to specify it
      my $aligner    = $bowtie2 ? '_bismark_bt2' : '_bismark';

      foreach my $chunk (1..$offset){

        if ($single_end){
          my $temp_name = "${original_filename}.temp.${chunk}${gz_suffix}";
          my $stem      = "${output_dir}${add_prefix}${temp_name}";

          push @temp_input,   $temp_name;
          push @temp_output,  "${stem}${aligner}.bam";
          push @temp_reports, "${stem}${aligner}_SE_report.txt";

          push @temp_unmapped_1,  "${stem}_unmapped_reads.fq"  if ($unmapped);
          push @temp_ambiguous_1, "${stem}_ambiguous_reads.fq" if ($ambiguous);
        }
        elsif($paired_end){
          my $temp_name_1 = "${original_filename_1}.temp.${chunk}${gz_suffix}";
          my $temp_name_2 = "${original_filename_2}.temp.${chunk}${gz_suffix}";
          my $stem_1      = "${output_dir}${add_prefix}${temp_name_1}";
          my $stem_2      = "${output_dir}${add_prefix}${temp_name_2}";

          push @temp_input,   $temp_name_1, $temp_name_2;
          push @temp_output,  "${stem_1}${aligner}_pe.bam"; # alignment output and report are named after mate 1
          push @temp_reports, "${stem_1}${aligner}_PE_report.txt";

          if ($unmapped){
            push @temp_unmapped_1, "${stem_1}_unmapped_reads_1.fq";
            push @temp_unmapped_2, "${stem_2}_unmapped_reads_2.fq";
          }

          if ($ambiguous){
            push @temp_ambiguous_1, "${stem_1}_ambiguous_reads_1.fq";
            push @temp_ambiguous_2, "${stem_2}_ambiguous_reads_2.fq";
          }
        }
      }

      warn "\n\nRight, cleaning up now...\n\n";

      # deleting temp files;
      warn "Deleting temporary sequence files...\n";
      foreach my $temp (@temp_input){
        $temp =~ s/.*\///; # deleting path information
        print "${temp_dir}${temp}\t";
        unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n";
      }
      print "\n\n";

      # merging temp BAM files
      if ($single_end){
        merge_individual_BAM_files(\@temp_output,$original_filename,$single_end);
      }
      else{
        merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end);
      }

      # deleting temp BAM files
      warn "Deleting temporary BAM files...\n";
      foreach my $temp (@temp_output){
        $temp =~ s/.*\///; # deleting path information
        print "${output_dir}${temp}\t";
        unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n";
      }
      print "\n\n";

      # NOTE(review): the unmapped/ambiguous names were built with $output_dir already in front, and
      # $output_dir is prepended again for the unlink calls below. This presumably relies on the
      # merge_individual_*_files subs stripping the path from the array elements in place - confirm.
      if ($unmapped){
        if ($single_end){
          merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end);
        }
        else{
          merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1');
          merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2');
        }

        # deleting temp unmapped files
        warn "Deleting temporary unmapped files...\n";
        foreach my $temp (@temp_unmapped_1){
          print "$temp\t";
          unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n";
        }
        if ($paired_end){
          foreach my $temp (@temp_unmapped_2){
            print "$temp\t";
            unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n";
          }
        }
        print "\n\n";

      }

      if ($ambiguous){
        if ($single_end){
          merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end);
        }
        else{
          merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1');
          merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2');
        }

        # deleting temp ambiguous files
        warn "Deleting temporary ambiguous files...\n";
        foreach my $temp (@temp_ambiguous_1){
          print "$temp\t";
          unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n";
        }

        if ($paired_end){
          foreach my $temp (@temp_ambiguous_2){
            print "$temp\t";
            unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n";
          }
        }
        print "\n\n";
      }

      # resetting the counters once more so we can add all data from all temporary reports
      reset_counters_and_fhs($original_filename);

      ### Merging the Bismark mapping report files
      if ($single_end){
        merge_individual_splitting_reports(\@temp_reports,$original_filename,$single_end);
        print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis');
      }
      else{
        merge_individual_splitting_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2);
        print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis');
      }

      # deleting temp report files (merge_individual_splitting_reports has removed the path from each entry)
      warn "Deleting temporary report files...\n";
      foreach my $temp (@temp_reports){
        print "$temp\t";
        unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n";
      }
      print "\n\n";

    }

  }

  if ($pid){ # only for the Parent
    warn "\n====================\nBismark run complete\n====================\n\n";
  }

}
789 | |
### Merges the temporary Bismark mapping reports of a split run into a single report.
###
### Arguments:
###   $temp_reports         array ref of temporary report file names. Path information is stripped
###                         from the entries IN PLACE, i.e. the caller's array is modified (the caller
###                         re-uses the stripped names together with $output_dir to delete the files)
###   $original_filename_1  input file the merged report is named after (Read 1 for paired-end data)
###   $single_end           true for single-end data, false for paired-end data
###   $original_filename_2  Read 2 input file; only used in the report title of paired-end data
###
### Side effects:
###   - opens the package filehandle REPORT on the merged report in $output_dir and leaves it open;
###     the print_final_analysis_report_* subs called afterwards print to REPORT
###   - every temporary report is passed to read_alignment_report(), which adds its counts to %counting
###
### Dies if the merged report cannot be created or a temporary report cannot be read.
sub merge_individual_splitting_reports{

  my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_;

  my $report_file = $original_filename_1;
  $report_file =~ s/.*\///; # removing path information
  if ($prefix){
    $report_file = "${prefix}.${report_file}";
  }
  if ($basename){ # Output file basename is set using the -B argument
    $report_file = $basename;
  }

  # _bismark[_bt2]_{SE|PE}_report.txt
  my $aligner_tag = $bowtie2    ? '_bt2' : '';
  my $layout_tag  = $single_end ? 'SE'   : 'PE';
  $report_file .= "_bismark${aligner_tag}_${layout_tag}_report.txt";

  warn "Writing report to ${output_dir}${report_file}\n";
  open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n";

  foreach my $temp (@$temp_reports){
    $temp =~ s/.*\///; # removing path information (aliases the caller's array elements)
  }

  warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n";

  if ($single_end){
    print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n";
  }
  else{ # paired-end
    print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n";
  }

  my $header_copied = 0; # set once the 'Final Alignment report' line of a temporary report was reached

  foreach my $temp (@$temp_reports){

    warn "Merging from file >> $temp <<\n";

    ### The run information (everything up to the 'Final Alignment report' line) is only copied once.
    ### Once this has happened there is nothing left to take from the header of the remaining reports,
    ### so they are no longer scanned here
    unless ($header_copied){
      open (my $in,'<',"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}': $!\n";

      while (my $line = <$in>){
	chomp $line;
	next if ($line =~ /^Bismark report/); # the title line was already written above

	if ($line =~ /^Final Alignment/){
	  $header_copied = 1;
	  last;
	}
	print REPORT "$line\n";
      }
      close $in or warn "Failed to close filehandle\n";
    }

    ### Simon says: You are going to regret this in the future. Just for the record. He might be right...
    read_alignment_report($temp,$single_end);

  }
  warn "\n";

}
870 | |
### Reads the statistics of one temporary mapping report and adds them to the global %counting hash.
###
### Arguments:
###   $report      name of the report file, relative to $output_dir
###   $single_end  true for single-end data (strand counts are added to the CT_CT/CT_GA/GA_CT/GA_GA
###                counters), false for paired-end data (CT_GA_CT/CT_GA_GA/GA_CT_CT/GA_CT_GA counters)
###
### Each relevant report line has the form "<description>:\t<count>"; the count is taken from the
### second tab-separated field. A statistic that does not occur in the report contributes 0.
### The 'Unknown context' counts are only added when running in Bowtie 2 mode ($bowtie2).
###
### Dies if the report cannot be opened.
sub read_alignment_report{
  my ($report,$single_end) = @_;

  ### all statistics start at 0 so that a line missing from a report does not add an undefined value
  my %stat = map { $_ => 0 } qw(total_seqs unique no_aln multiple no_genomic
				meth_CpG meth_CHG meth_CHH meth_unknown
				unmeth_CpG unmeth_CHG unmeth_CHH unmeth_unknown
				number_OT number_CTOT number_CTOB number_OB);

  ### Ordered list of [line pattern, statistic]; the first matching pattern wins.
  ### Paired-end and single-end reports use different wording for the same statistic
  my @line_types = (
		    ### General Alignment stats
		    [ qr/^Sequence pairs analysed in total:/                        , 'total_seqs' ], # Paired-end
		    [ qr/^Sequences analysed in total:/                             , 'total_seqs' ], # Single-end
		    [ qr/^Number of paired-end alignments with a unique best hit:/  , 'unique'     ], # Paired-end
		    [ qr/^Number of alignments with a unique best hit from/         , 'unique'     ], # Single-end
		    [ qr/^Sequence pairs with no alignments under any condition:/   , 'no_aln'     ], # Paired-end
		    [ qr/^Sequences with no alignments under any condition:/        , 'no_aln'     ], # Single-end
		    [ qr/^Sequence pairs did not map uniquely:/                     , 'multiple'   ], # Paired-end
		    [ qr/^Sequences did not map uniquely:/                          , 'multiple'   ], # Single-end
		    [ qr/^Sequence pairs which were discarded because genomic sequence could not be extracted:/ , 'no_genomic' ], # Paired-end
		    [ qr/^Sequences which were discarded because genomic sequence could not be extracted:/      , 'no_genomic' ], # Single-end

		    ### Context Methylation
		    [ qr/^Total methylated C\'s in CpG context:/     , 'meth_CpG'     ],
		    [ qr/^Total methylated C\'s in CHG context:/     , 'meth_CHG'     ],
		    [ qr/^Total methylated C\'s in CHH context:/     , 'meth_CHH'     ],
		    [ qr/^Total methylated C\'s in Unknown context:/ , 'meth_unknown' ],
		    [ qr/^Total (?:unmethylated C\'s|C to T conversions) in CpG context:/     , 'unmeth_CpG'     ],
		    [ qr/^Total (?:unmethylated C\'s|C to T conversions) in CHG context:/     , 'unmeth_CHG'     ],
		    [ qr/^Total (?:unmethylated C\'s|C to T conversions) in CHH context:/     , 'unmeth_CHH'     ],
		    [ qr/^Total (?:unmethylated C\'s|C to T conversions) in Unknown context:/ , 'unmeth_unknown' ],

		    ### Strand Origin
		    [ qr/^CT\/GA\/CT:/ , 'number_OT'   ], # Paired-end
		    [ qr/^CT\/CT:/     , 'number_OT'   ], # Single-end
		    [ qr/^GA\/CT\/CT:/ , 'number_CTOT' ], # Paired-end
		    [ qr/^GA\/CT:/     , 'number_CTOT' ], # Single-end
		    [ qr/^GA\/CT\/GA:/ , 'number_CTOB' ], # Paired-end
		    [ qr/^GA\/GA:/     , 'number_CTOB' ], # Single-end
		    [ qr/^CT\/GA\/GA:/ , 'number_OB'   ], # Paired-end
		    [ qr/^CT\/GA:/     , 'number_OB'   ], # Single-end
		   );

  open (my $aln,'<',"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report': $!\n";

 LINE:
  while (my $line = <$aln>){
    chomp $line;

    foreach my $line_type (@line_types){
      my ($pattern,$name) = @$line_type;
      next unless ($line =~ $pattern);

      my (undef,$value) = split (/\t/,$line);
      $stat{$name} = defined $value ? $value : 0; # a matching line without a tab-separated count counts as 0
      next LINE;
    }
  }
  close $aln or warn "Failed to close filehandle\n";

  $counting{sequences_count}                               += $stat{total_seqs};
  $counting{unique_best_alignment_count}                   += $stat{unique};
  $counting{no_single_alignment_found}                     += $stat{no_aln};
  $counting{unsuitable_sequence_count}                     += $stat{multiple};
  $counting{genomic_sequence_could_not_be_extracted_count} += $stat{no_genomic};

  $counting{total_meCHH_count} += $stat{meth_CHH};
  $counting{total_meCHG_count} += $stat{meth_CHG};
  $counting{total_meCpG_count} += $stat{meth_CpG};

  $counting{total_unmethylated_CHH_count} += $stat{unmeth_CHH};
  $counting{total_unmethylated_CHG_count} += $stat{unmeth_CHG};
  $counting{total_unmethylated_CpG_count} += $stat{unmeth_CpG};

  if ($bowtie2){ # the Unknown context is only accumulated in Bowtie 2 mode
    $counting{total_meC_unknown_count}            += $stat{meth_unknown};
    $counting{total_unmethylated_C_unknown_count} += $stat{unmeth_unknown};
  }

  if ($single_end){
    $counting{CT_CT_count} += $stat{number_OT};
    $counting{CT_GA_count} += $stat{number_OB};
    $counting{GA_CT_count} += $stat{number_CTOT};
    $counting{GA_GA_count} += $stat{number_CTOB};
  }
  else{
    # paired-end
    $counting{GA_CT_CT_count} += $stat{number_CTOT};
    $counting{CT_GA_CT_count} += $stat{number_OT};
    $counting{GA_CT_GA_count} += $stat{number_CTOB};
    $counting{CT_GA_GA_count} += $stat{number_OB};
  }
}
1060 | |
### Concatenates the temporary ambiguous-read files of a split run into one gzip compressed file.
###
### Arguments:
###   $temp_ambiguous      array ref of temporary file names. Path information is stripped from the
###                        entries IN PLACE, i.e. the caller's array is modified (the caller re-uses
###                        the stripped names together with $output_dir to delete the files)
###   $original_filename   input file the merged file is named after (unless --basename is in use)
###   $single_end          true for single-end data, false for paired-end data
###   $paired_information  '_1' or '_2'; only used (and only required) for paired-end data
###
### The merged file is written to $output_dir as
###   <basename or [prefix.]input name>_ambiguous_reads[_1|_2].{fq|fa}.gz
### Temporary files ending in 'gz' are read through zcat, all others are read as plain text.
###
### Dies if the gzip output pipe or one of the temporary files cannot be opened.
sub merge_individual_ambiguous_files{

  my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_;

  my $read_tag  = $single_end ? '' : $paired_information;
  my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
  my $suffix    = "_ambiguous_reads${read_tag}.${extension}.gz";

  my $ambiguous_file;
  if ($basename){ # Output file basename is set using the -B argument (a prefix is not applied in this case)
    $ambiguous_file = "${basename}${suffix}";
  }
  else{
    $ambiguous_file = $original_filename;
    $ambiguous_file =~ s/.*\///; # removing path information
    if ($prefix){
      $ambiguous_file = "${prefix}.${ambiguous_file}";
    }
    $ambiguous_file .= $suffix;
  }

  foreach my $temp (@$temp_ambiguous){
    $temp =~ s/.*\///; # removing path information (aliases the caller's array elements)
  }

  ### the output path is handed to a shell, so it is single-quoted to survive spaces and shell metacharacters
  (my $quoted_outfile = "$output_dir$ambiguous_file") =~ s/'/'\\''/g;
  open (AMBIGUOUS,'|-',"gzip -c - > '$quoted_outfile'") or die "Failed to write to $ambiguous_file: $!\n";
  warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n";

  foreach my $temp (@$temp_ambiguous){
    warn "Merging from file >> $temp <<\n";
    my $temp_path = "${output_dir}$temp";
    my $in;

    if ($temp =~ /gz$/){
      # list form of the pipe open: zcat is started directly, no shell sees the file name
      open ($in,'-|','zcat',$temp_path) or die "Failed to read from ambiguous temp file '$temp_path': $!\n";
    }
    else{
      open ($in,'<',$temp_path) or die "Failed to read from ambiguous temp file '$temp_path': $!\n";
    }

    while (my $line = <$in>){
      print AMBIGUOUS $line;
    }
    close $in or warn "Failed to close filehandle\n";
  }
  warn "\n";

  close AMBIGUOUS or warn "Failed to close output filehandle AMBIGUOUS\n\n";
}
1135 | |
1136 | |
### Concatenates the temporary unmapped-read files of a split run into one gzip compressed file.
###
### Arguments:
###   $temp_unmapped       array ref of temporary file names. Path information is stripped from the
###                        entries IN PLACE, i.e. the caller's array is modified
###   $original_filename   input file the merged file is named after (unless --basename is in use)
###   $single_end          true for single-end data, false for paired-end data
###   $paired_information  '_1' or '_2'; only used (and only required) for paired-end data
###
### The merged file is written to $output_dir as
###   <basename or [prefix.]input name>_unmapped_reads[_1|_2].{fq|fa}.gz
### Temporary files ending in 'gz' are read through zcat, all others are read as plain text.
### The package filehandle UNMAPPED is (re-)opened for the merged file and closed again at the end.
###
### Dies if the gzip output pipe or one of the temporary files cannot be opened.
sub merge_individual_unmapped_files{

  my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_;

  my $read_tag  = $single_end ? '' : $paired_information;
  my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
  my $suffix    = "_unmapped_reads${read_tag}.${extension}.gz";

  my $unmapped_file;
  if ($basename){ # Output file basename is set using the -B argument (a prefix is not applied in this case)
    $unmapped_file = "${basename}${suffix}";
  }
  else{
    $unmapped_file = $original_filename;
    $unmapped_file =~ s/.*\///; # removing path information
    if ($prefix){
      $unmapped_file = "${prefix}.${unmapped_file}";
    }
    $unmapped_file .= $suffix;
  }

  foreach my $temp (@$temp_unmapped){
    $temp =~ s/.*\///; # removing path information (aliases the caller's array elements)
  }

  ### the output path is handed to a shell, so it is single-quoted to survive spaces and shell metacharacters
  (my $quoted_outfile = "${output_dir}${unmapped_file}") =~ s/'/'\\''/g;
  open (UNMAPPED,'|-',"gzip -c - > '$quoted_outfile'") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n";
  warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n";

  foreach my $temp (@$temp_unmapped){
    warn "Merging from file >> $temp <<\n";
    my $temp_path = "${output_dir}${temp}";
    my $in;

    if ($temp =~ /gz$/){
      # list form of the pipe open: zcat is started directly, no shell sees the file name
      open ($in,'-|','zcat',$temp_path) or die "Failed to read from unmapped temp file '$temp_path': $!\n";
    }
    else{
      open ($in,'<',$temp_path) or die "Failed to read from unmapped temp file '$temp_path': $!\n";
    }

    while (my $line = <$in>){
      print UNMAPPED $line;
    }
    close $in or warn "Failed to close filehandle\n";
  }
  warn "\n";

  close UNMAPPED or warn "Failed to close output filehandle UNMAPPED\n\n";
}
1211 | |
1212 | |
### Merges the temporary BAM files of a split run into a single BAM file using Samtools.
###
### Arguments:
###   $tempbam            array ref of temporary BAM file names. Path information is stripped from
###                       the entries IN PLACE, i.e. the caller's array is modified
###   $original_filename  input file the merged BAM file is named after (unless --basename is in use)
###   $single_end         true for single-end data, false for paired-end data
###
### The merged file is written to $output_dir as
###   [prefix.]<input name>_bismark[_bt2][_pe].bam   or, with --basename,   <basename>[_pe].bam
### The SAM header is taken from the first temporary file only; the remaining files contribute
### alignments only. The package filehandle OUT is (re-)opened for the merged file and closed at the end.
###
### Dies if one of the samtools pipes cannot be opened.
sub merge_individual_BAM_files{

  my ($tempbam,$original_filename,$single_end) = @_;

  foreach my $temp_bam (@$tempbam){
    $temp_bam =~ s/.*\///; # deleting path information (aliases the caller's array elements)
  }

  my $merged_name;
  if ($basename){ # Output file basename is set using the -B argument (a prefix is not applied in this case)
    $merged_name = $single_end ? "${basename}.bam" : "${basename}_pe.bam";
  }
  else{
    $merged_name = $original_filename;
    $merged_name =~ s/.*\///; # deleting path information
    if ($prefix){
      $merged_name = "$prefix.$merged_name";
    }
    $merged_name .= '_bismark';
    $merged_name .= '_bt2' if ($bowtie2);
    $merged_name .= '_pe'  unless ($single_end);
    $merged_name .= '.bam';
  }

  ### file names are handed to a shell, so they are single-quoted to survive spaces and shell metacharacters.
  ### $samtools_path is interpolated unchanged, exactly as before
  my $shell_quote = sub {
    my ($path) = @_;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";
  my $quoted_outfile = $shell_quote->("${output_dir}${merged_name}");
  open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $quoted_outfile") or die "Failed to write to $merged_name: $!\n";

  my $header_needed = 1; # only for the first file we print the header as well

  foreach my $temp_bam (@$tempbam){

    warn "Merging from file >> $temp_bam <<\n";

    my $view       = $header_needed ? 'view -h' : 'view';
    my $quoted_bam = $shell_quote->("${output_dir}${temp_bam}");
    open (my $in,'-|',"$samtools_path $view $quoted_bam") or die "Failed to read from BAM file ${output_dir}${temp_bam}: $!\n";

    while (my $line = <$in>){
      print OUT $line;
    }
    close $in or warn "Failed to close filehandle\n";
    $header_needed = 0;
  }
  warn "\n";

  close OUT or warn "Failed to close output filehandle\n\n";
}
1281 | |
### Sets up all output files for a single-end run and then starts the methylation call procedure.
###
### Arguments:
###   $sequence_file   the sequence file being analysed (may contain path information)
###   $C_to_T_infile   passed through to the process_single_end_*_file_for_methylation_call sub
###   $G_to_A_infile   passed through to the process_single_end_*_file_for_methylation_call sub
###   $pid             passed through to the process_single_end_*_file_for_methylation_call sub
###
### Output files (all in $output_dir; <name> is [prefix.]<input file name>, or <basename> if -B is used):
###   alignments   <name>_bismark[_bt2].sam (_bismark.txt for --vanilla Bowtie 1 output), <basename>.sam;
###                'sam' is replaced by 'bam' when piping through Samtools ($bam == 1),
###                '.gz' is appended when compressing with gzip ($bam == 2)
###   report       <name>_bismark[_bt2]_SE_report.txt or <basename>_SE_report.txt
###   unmapped     <name>_unmapped_reads.{fq|fa}   (only if $unmapped)
###   ambiguous    <name>_ambiguous_reads.{fq|fa}  (only if $ambiguous)
###
### The package filehandles OUT, REPORT, UNMAPPED and AMBIG are opened here and are NOT closed by this sub.
### $bam is set to 0 if it was undefined. The genome is read into %chromosomes unless it is already
### held in memory, and a SAM header is generated unless --vanilla or --sam-no-hd was requested.
###
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### file name without path information
  my $filename = ($sequence_file =~ m/.*\/(.*)$/) ? $1 : $sequence_file;

  ### common start of all output file names
  my $stem = $prefix ? "$prefix.$filename" : $filename;

  ### file extension of the unmapped/ambiguous read files
  my $read_extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';

  ### output paths are handed to a shell when piping through samtools or gzip, so they are single-quoted
  ### to survive spaces and shell metacharacters. $samtools_path is interpolated unchanged, exactly as before
  my $shell_quote = sub {
    my ($path) = @_;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  ### printing all alignments to a results file
  my $outfile;
  if ($basename){ # Output file basename is set using the -B argument
    $outfile = "${basename}.sam";
  }
  elsif ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = "${stem}_bismark_bt2.sam";
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = "${stem}_bismark.txt";
  }
  else{ # SAM is the default output
    $outfile = "${stem}_bismark.sam";
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    $outfile =~ s/sam$/bam/;
    my $quoted_outfile = $shell_quote->("$output_dir$outfile");
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $quoted_outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    my $quoted_outfile = $shell_quote->("$output_dir$outfile");
    open (OUT,'|-',"gzip -c - > $quoted_outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile;
  if ($basename){ # Output file basename is set using the -B argument
    $reportfile = "${basename}_SE_report.txt";
  }
  elsif ($bowtie2){
    $reportfile = "${stem}_bismark_bt2_SE_report.txt";
  }
  else{
    $reportfile = "${stem}_bismark_SE_report.txt";
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  ### with -B the read files are named after the basename, otherwise after the (prefixed) input file
  my $read_file_stem = $basename ? $basename : $stem;

  if ($unmapped){
    my $unmapped_file = "${read_file_stem}_unmapped_reads.${read_extension}";
    open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
    warn "Unmapped sequences will be written to $output_dir$unmapped_file\n";
  }

  if ($ambiguous){
    my $ambiguous_file = "${read_file_stem}_ambiguous_reads.${read_extension}";
    open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
    warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n";
  }
  elsif ($pbat){
    print REPORT "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n";
  }
  else{
    print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n";
  }

  if ($bowtie2){
    print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
  }
}
1443 | |
### Sets up all output files for a paired-end run and then starts the methylation call procedure.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2   the Read 1 and Read 2 sequence files (may contain path information)
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2,
###   $pid                                 passed through to the process_fast[AQ]_files_for_paired_end_methylation_calls sub
###
### Output files (all in $output_dir; <name_N> is [prefix.]<Read N file name>, or <basename> if -B is used):
###   alignments   <name_1>_bismark[_bt2]_pe.sam (_bismark_pe.txt for --vanilla Bowtie 1 output), <basename>_pe.sam;
###                'sam' is replaced by 'bam' when piping through Samtools ($bam == 1),
###                '.gz' is appended when compressing with gzip ($bam == 2)
###   report       <name_1>_bismark[_bt2]_PE_report.txt or <basename>_PE_report.txt
###   unmapped     <name_1>_unmapped_reads_1.{fq|fa} and <name_2>_unmapped_reads_2.{fq|fa}     (only if $unmapped)
###   ambiguous    <name_1>_ambiguous_reads_1.{fq|fa} and <name_2>_ambiguous_reads_2.{fq|fa}   (only if $ambiguous)
###
### The package filehandles OUT, REPORT, UNMAPPED_1/2 and AMBIG_1/2 are opened here and are NOT closed by this sub.
### $bam is set to 0 if it was undefined. The genome is read into %chromosomes unless it is already
### held in memory, and a SAM header is generated unless --vanilla or --sam-no-hd was requested.
###
### Dies if any of the output files cannot be opened.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;

  ### file names without path information
  my $filename_1 = ($sequence_file_1 =~ m/.*\/(.*)$/) ? $1 : $sequence_file_1;
  my $filename_2 = ($sequence_file_2 =~ m/.*\/(.*)$/) ? $1 : $sequence_file_2;

  ### common start of the output file names
  my $stem_1 = $prefix ? "$prefix.$filename_1" : $filename_1;
  my $stem_2 = $prefix ? "$prefix.$filename_2" : $filename_2;

  ### file extension of the unmapped/ambiguous read files
  my $read_extension = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';

  ### output paths are handed to a shell when piping through samtools or gzip, so they are single-quoted
  ### to survive spaces and shell metacharacters. $samtools_path is interpolated unchanged, exactly as before
  my $shell_quote = sub {
    my ($path) = @_;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  ### printing all alignments to a results file
  my $outfile;
  if ($basename){ # Output file basename is set using the -B argument
    $outfile = "${basename}_pe.sam";
  }
  elsif ($bowtie2){ # SAM format is the default Bowtie 2 output
    $outfile = "${stem_1}_bismark_bt2_pe.sam";
  }
  elsif ($vanilla){ # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
    $outfile = "${stem_1}_bismark_pe.txt";
  }
  else{ # SAM format is the default Bowtie 1 output
    $outfile = "${stem_1}_bismark_pe.sam";
  }

  $bam = 0 unless (defined $bam);

  if ($bam == 1){ ### Samtools is installed, writing out BAM directly
    $outfile =~ s/sam$/bam/;
    my $quoted_outfile = $shell_quote->("$output_dir$outfile");
    open (OUT,'|-',"$samtools_path view -bSh 2>/dev/null - > $quoted_outfile") or die "Failed to write to $outfile: $!\n";
  }
  elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
    $outfile .= '.gz';
    my $quoted_outfile = $shell_quote->("$output_dir$outfile");
    open (OUT,'|-',"gzip -c - > $quoted_outfile") or die "Failed to write to $outfile: $!\n";
  }
  else{ # uncompressed output, default
    open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
  }

  warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
  sleep(1);

  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile;
  if ($basename){ # Output file basename is set using the -B argument
    $reportfile = "${basename}_PE_report.txt";
  }
  elsif ($bowtie2){
    $reportfile = "${stem_1}_bismark_bt2_PE_report.txt";
  }
  else{
    $reportfile = "${stem_1}_bismark_PE_report.txt";
  }

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";

  if ($bowtie2){
    print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n";
  }
  else{
    print REPORT "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n";
  }

  ### with -B the read files are named after the basename, otherwise after the (prefixed) input files
  my $read_file_stem_1 = $basename ? $basename : $stem_1;
  my $read_file_stem_2 = $basename ? $basename : $stem_2;

  ### Unmapped read output
  if ($unmapped){
    my $unmapped_1 = "${read_file_stem_1}_unmapped_reads_1.${read_extension}";
    my $unmapped_2 = "${read_file_stem_2}_unmapped_reads_2.${read_extension}";

    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguous read output
  if ($ambiguous){
    my $amb_1 = "${read_file_stem_1}_ambiguous_reads_1.${read_extension}";
    my $amb_2 = "${read_file_stem_2}_ambiguous_reads_2.${read_extension}";

    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n";
  }
  elsif ($pbat){
    print REPORT "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n";
  }
  else{
    print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n";
  }

  ### if 2 or more files are provided we might still hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    warn "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input files are in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }
  ### Input files are in FastQ format
  else{
    process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
  }
}
1639 | |
### Prints the final alignment report and the final cytosine methylation report of a single-end run.
### The report is written to the REPORT filehandle and echoed to STDERR (via warn) or STDOUT (via print).
###
### Arguments:
###   $C_to_T_infile, $G_to_A_infile - names of the temporary converted read files inside $temp_dir
###   $pid                           - accepted for the callers' benefit; not used in this subroutine
###   $merge_multi                   - if true, the temporary files are left alone and only the report is printed
###
### All numbers are read from the file-level %counting hash.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_;

  ### Most lines of the report go to two destinations; these helpers keep both copies identical
  my $warn_and_report = sub {
    warn @_;
    print REPORT @_;
  };
  my $print_and_report = sub {
    print @_;
    print REPORT @_;
  };

  if ($merge_multi){
    warn "Printing a final merged alignment report for all individual sub-reports\n\n";
  }
  else{
    ### All sequences from the original sequence file have been analysed now
    ### deleting temporary C->T or G->A infiles
    my @temp_files;
    if ($directional){
      @temp_files = ("$temp_dir$C_to_T_infile");
    }
    elsif ($pbat){
      @temp_files = ("$temp_dir$G_to_A_infile");
    }
    else{
      @temp_files = ("$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile");
    }

    my $deletion_successful = unlink @temp_files;

    if (@temp_files == 1){
      if ($deletion_successful == 1){
        warn "\nSuccessfully deleted the temporary file $temp_files[0]\n\n";
      }
      else{
        ### reporting the full path that was handed to unlink (i.e. including $temp_dir)
        warn "Could not delete temporary file $temp_files[0] properly $!\n";
      }
    }
    else{
      if ($deletion_successful == 2){
        warn "\nSuccessfully deleted the temporary files $temp_files[0] and $temp_files[1]\n\n";
      }
      else{
        warn "Could not delete temporary files properly $!\n";
      }
    }
  }

  ### printing a final report for the alignment procedure
  $warn_and_report->("Final Alignment report\n",'='x22,"\n");

  ### printing a final report for the methylation call procedure
  $warn_and_report->("Sequences analysed in total:\t$counting{sequences_count}\n");

  my $percent_alignable_sequences = 0;
  unless ($counting{sequences_count} == 0){
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  ### STDERR gets one more trailing newline than the report file
  my $mapping_summary = "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
  warn "$mapping_summary\n";
  print REPORT $mapping_summary;

  ### The number of overruled low complexity alignments is not part of the report, so no percentage is calculated for it

  $print_and_report->("Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n");
  $print_and_report->("Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n");
  $print_and_report->("Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n");
  $print_and_report->("Number of sequences with unique best (first) alignment came from the bowtie output:\n");
  $print_and_report->(join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n");

  if ($directional){
    $print_and_report->("Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n");
  }

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  $warn_and_report->("Final Cytosine Methylation Report\n",'='x33,"\n");
  $warn_and_report->("Total number of C's analysed:\t$total_number_of_C\n\n");

  $warn_and_report->("Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n");
  $warn_and_report->("Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n");
  $warn_and_report->("Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n");
  if ($bowtie2){
    $warn_and_report->("Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n");
  }
  $warn_and_report->("\n");

  $warn_and_report->("Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n");
  $warn_and_report->("Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n");
  $warn_and_report->("Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n");
  if ($bowtie2){
    $warn_and_report->("Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n");
  }
  $warn_and_report->("\n");

  ### printing the methylation percentage per context: label, key of methylated calls, key of unmethylated calls
  my @contexts = (
    ['CpG context','total_meCpG_count','total_unmethylated_CpG_count'],
    ['CHG context','total_meCHG_count','total_unmethylated_CHG_count'],
    ['CHH context','total_meCHH_count','total_unmethylated_CHH_count'],
  );
  ### the Unknown C context is only reported in Bowtie 2 mode
  if ($bowtie2){
    push @contexts, ['Unknown context (CN or CHN)','total_meC_unknown_count','total_unmethylated_C_unknown_count'];
  }

  foreach my $context (@contexts){
    my ($label,$methylated_key,$unmethylated_key) = @{$context};
    my $calls_in_context = $counting{$methylated_key}+$counting{$unmethylated_key};

    ### a percentage can only be determined if there was at least one call in this context
    if ($calls_in_context > 0){
      my $percent_methylated = sprintf("%.1f",100*$counting{$methylated_key}/$calls_in_context);
      $warn_and_report->("C methylated in $label:\t${percent_methylated}%\n");
    }
    else{
      $warn_and_report->("Can't determine percentage of methylated Cs in $label if value was 0\n");
    }
  }
  $warn_and_report->("\n\n");

  if ($seqID_contains_tabs){
    $warn_and_report->("The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n");
  }
}
1836 | |
1837 | |
### Prints the final alignment report and the final cytosine methylation report of a paired-end run.
### The report is written to the REPORT filehandle and echoed to STDERR (via warn) or STDOUT (via print).
###
### Arguments:
###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2
###                 - names of the temporary converted read files inside $temp_dir
###   $pid          - accepted for the callers' benefit; not used in this subroutine
###   $merge_multi  - if true, the temporary files are left alone and only the report is printed
###
### All numbers are read from the file-level %counting hash.
sub print_final_analysis_report_paired_ends{
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_;

  if ($merge_multi){
    warn "Printing a final merged alignment report for all individual sub-reports\n\n";
  }
  else{
    ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles.
    ### Each case determines the files to remove and the wording of the message used if the removal fails
    my @temp_files;
    my $failure_message;

    if ($directional){
      if ($G_to_A_infile_2){
        @temp_files = ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2");
        $failure_message = "Could not delete temporary files $temp_files[0] and $temp_files[1] properly";
      }
      else{ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
        @temp_files = ("$temp_dir$C_to_T_infile_1");
        $failure_message = "Could not delete temporary file $temp_files[0] properly";
      }
    }
    else{
      if ($G_to_A_infile_2 and $C_to_T_infile_2){
        @temp_files = ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2");
      }
      else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
        @temp_files = ("$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1");
      }
      $failure_message = "Could not delete temporary files properly";
    }

    my $files_deleted = unlink @temp_files;

    if ($files_deleted == scalar @temp_files){
      ### listing the files as 'A', 'A and B' or 'A, B, C and D'
      my @listed = @temp_files;
      my $last_file = pop @listed;
      my $file_list = @listed ? join (', ',@listed)." and $last_file" : $last_file;
      my $noun = @listed ? 'files' : 'file';
      warn "\nSuccessfully deleted the temporary $noun $file_list\n\n";
    }
    else{
      warn "$failure_message: $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  my $alignment_header = "Final Alignment report\n".('='x22)."\n";
  warn $alignment_header;
  print REPORT $alignment_header;

  ### printing a final report for the methylation call procedure
  my $pairs_analysed = "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
  warn $pairs_analysed;
  print REPORT $pairs_analysed;

  my $percent_alignable_sequence_pairs = ($counting{sequences_count} == 0)
    ? 0
    : sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});

  ### the line endings of the mapping efficiency differ between STDOUT and the report file
  my $mapping_summary = "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%";
  print "$mapping_summary\n\n";
  print REPORT "$mapping_summary \n";

  ### alignment statistics; the same lines go to STDOUT first and then to the report file
  my @alignment_lines = (
    "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
    join ("\n",
          "CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)",
          "GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)",
          "GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)",
          "CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)")."\n\n",
  );
  foreach my $line (@alignment_lines){
    print $line;
  }
  foreach my $line (@alignment_lines){
    print REPORT $line;
  }

  if ($directional){
    my $rejected = "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    print $rejected;
    print REPORT $rejected;
  }

  ### detailed information about Cs analysed
  my $methylation_header = "Final Cytosine Methylation Report\n".('='x33)."\n";
  warn $methylation_header;
  print REPORT $methylation_header;

  my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
  my $total_line = "Total number of C's analysed:\t$total_number_of_C\n\n";

  my @methylated_lines = (
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n",
  );
  my @unmethylated_lines = (
    "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n",
  );
  ### calls in Unknown context are only listed in Bowtie 2 mode
  if ($bowtie2){
    push @methylated_lines,   "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
    push @unmethylated_lines, "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
  }

  ### STDERR copy
  warn $total_line;
  foreach my $line (@methylated_lines,"\n",@unmethylated_lines,"\n"){
    warn $line;
  }

  ### report file copy; in Bowtie 2 mode the Unknown context lines are followed by one additional newline here
  print REPORT $total_line;
  print REPORT @methylated_lines;
  print REPORT "\n" if ($bowtie2);
  print REPORT "\n";
  print REPORT @unmethylated_lines;
  print REPORT "\n" if ($bowtie2);
  print REPORT "\n";

  ### returns the methylation percentage with one decimal place, or undef if there were no calls at all
  my $percent_methylated = sub {
    my ($methylated,$unmethylated) = @_;
    if (($methylated+$unmethylated) > 0){
      return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
    }
    return undef;
  };

  my $percent_meCHG       = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
  my $percent_meCHH       = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});
  my $percent_meCpG       = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
  my $percent_meC_unknown = $percent_methylated->($counting{total_meC_unknown_count},$counting{total_unmethylated_C_unknown_count});

  ### printing the methylation percentage for each context if applicable
  my @percentages = (
    ['CpG context',$percent_meCpG],
    ['CHG context',$percent_meCHG],
    ['CHH context',$percent_meCHH],
  );
  ### the Unknown C context is only reported in Bowtie 2 mode
  if ($bowtie2){
    push @percentages, ['unknown context (CN or CHN)',$percent_meC_unknown];
  }

  foreach my $entry (@percentages){
    my ($label,$percent) = @{$entry};
    my $message;
    if ($percent){
      $message = "C methylated in $label:\t${percent}%\n";
    }
    else{
      $message = "Can't determine percentage of methylated Cs in $label if value was 0\n";
    }
    warn $message;
    print REPORT $message;
  }

  print REPORT "\n\n";
  warn "\n\n";

}
2034 | |
### Reads a single-end FastA file record by record (one header line followed by one sequence line),
### passes every sequence on to the Bowtie 1 or Bowtie 2 result check and finally prints the analysis report.
###
### Arguments:
###   $sequence_file                 - FastA file to read; files ending in .gz are read through zcat
###   $C_to_T_infile, $G_to_A_infile - names of the temporary converted read files (handed on to the final report)
###   $pid                           - handed on to the final report
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;
  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### The file name is never passed through a shell or parsed for an open mode, so names containing
  ### spaces or shell metacharacters are read correctly
  if ($sequence_file =~ /\.gz$/){
    ### gzipped version of the infile
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <IN>;
    my $sequence = <IN>;
    ### readline returns undef at the end of the file; testing for definedness keeps a final line consisting of '0' intact
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers all records of the file, $counting{sequences_count} only the ones which were analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);

}
2107 | |
### Reads a single-end FastQ file record by record (four lines per record), passes every sequence and its
### quality string on to the Bowtie 1 or Bowtie 2 result check and finally prints the analysis report.
###
### Arguments:
###   $sequence_file                 - FastQ file to read; files ending in .gz are read through zcat
###   $C_to_T_infile, $G_to_A_infile - names of the temporary converted read files (handed on to the final report)
###   $pid                           - handed on to the final report
sub process_single_end_fastQ_file_for_methylation_call{

  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### The file name is never passed through a shell or parsed for an open mode, so names containing
  ### spaces or shell metacharacters are read correctly
  if ($sequence_file =~ /\.gz$/){
    ### gzipped version of the infile
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    ### readline returns undef at the end of the file; testing for definedness keeps a final line consisting of '0' intact
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers all records of the file, $counting{sequences_count} only the ones which were analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    ### $identifier_2 is deliberately not chomped, it is written out below including its line ending
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc$sequence,$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc$sequence,$identifier,$quality_value); # default Bowtie 1
    }

    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);

}
2189 | |
### Reads two paired-end FastA files in parallel (one header line followed by one sequence line per read),
### passes every read pair on to the Bowtie 1 or Bowtie 2 paired-end result check, closes the OUT filehandle
### and finally prints the paired-end analysis report.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastA files of read 1 and read 2; a file ending in .gz is read through zcat
###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2
###                                      - names of the temporary converted read files (handed on to the final report)
###   $pid                               - handed on to the final report
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
  ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end mapping.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
  ### converted genomes (either the C->T or G->A version)

  ### Compression is determined for each file individually, so a gzipped file is decompressed even if its mate is not gzipped.
  ### The file names are never passed through a shell or parsed for an open mode
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    ### readline returns undef at the end of a file; testing for definedness keeps a final line consisting of '0' intact
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers all read pairs of the files, $counting{sequences_count} only the ones which were analysed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies taken before chomp and before the leading > is removed; these are written to the ambiguous/unmapped files below
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
    }

    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2283 | |
### Reads two paired-end FastQ files in lock-step (one 4-line record from each file per iteration) and passes every read pair
### to the Bowtie 1 or Bowtie 2 paired-end result checker (selected via $bowtie2).
###
### Arguments:
###   $sequence_file_1, $sequence_file_2   FastQ files of read 1 and read 2 (plain text, or gzipped if the name ends in .gz)
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2,
###   $pid                                 not used here, only handed on to print_final_analysis_report_paired_ends()
###
### Side effects: increments $counting{sequences_count}, honours $skip and $upto, writes read pairs to AMBIG_1/AMBIG_2 or
### UNMAPPED_1/UNMAPPED_2 depending on the checker's return value, closes OUT and prints the final analysis report.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)

  ### Each infile is opened on its own merits, so a gzipped file paired with a plain text file is handled correctly as well.
  ### The list form of the pipe open and the 3-argument open hand the file name over verbatim (no shell, no mode parsing), so file
  ### names containing white space or shell metacharacters are safe.
  if ($sequence_file_1 =~ /\.gz$/){
    open (IN1,'-|','zcat',$sequence_file_1) or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
  }
  else{
    open (IN1,'<',$sequence_file_1) or die "Failed to read from $sequence_file_1: $!\n";
  }

  if ($sequence_file_2 =~ /\.gz$/){
    open (IN2,'-|','zcat',$sequence_file_2) or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
  }
  else{
    open (IN2,'<',$sequence_file_2) or die "Failed to read from $sequence_file_2: $!\n";
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # only needed for writing the read back out (--un / --ambiguous)
    my $quality_value_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # only needed for writing the read back out (--un / --ambiguous)
    my $quality_value_2 = <IN2>;

    unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2){
      # letting the user know if only one of the two files ran out of reads instead of stopping silently
      if (defined($identifier_1) xor defined($identifier_2)){
        warn "The files $sequence_file_1 and $sequence_file_2 do not contain the same number of sequences; stopping after $count sequence pair(s)\n";
      }
      last;
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    # the untouched IDs (still carrying the leading @ and the line ending) are kept for writing reads back out
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//;  # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }

    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  # Releasing the input handles (and reaping the zcat processes). The return values are deliberately not checked: if we stopped
  # reading early (--upto) zcat gets terminated by a broken pipe, which makes close() on the pipe report a failure.
  close IN1;
  close IN2;

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2395 | |
### Collects the Bowtie 1 alignments reported for one single-end read from all alignment instances in @fhs, decides whether
### there is a unique best alignment (lowest number of mismatches) and, if so, extracts the genomic sequence, performs the
### methylation call and prints the mapping result.
###
### Arguments:
###   $sequence        the read sequence
###   $identifier      the read ID, compared against the read ID of the pending alignment line of every instance
###   $quality_value   quality string of the read (optional; FastA reads get assigned Phred 40 throughout)
###
### Returns:
###   2        the read aligned ambiguously and --ambiguous was specified
###   1        the read should be written out as unmapped (only returned if --un was specified)
###   0        nothing needs to be written out by the caller (result printed, alignment rejected, or read discarded)
###   nothing  (bare return) no alignment was found at all and --un was not specified
sub check_bowtie_results_single_end{
  my ($sequence,$identifier,$quality_value) = @_;

  # FastA sequences get assigned a quality value of Phred 40 throughout. Testing for defined/length rather than truth so that
  # a quality string consisting of a single '0' is not mistaken for a missing one
  unless (defined $quality_value and length $quality_value){
    $quality_value = 'I'x(length$sequence);
  }

  my %mismatches = ();
  ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
  foreach my $index (0..$#fhs){

    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
    ### we are only interested in this instance if its pending alignment belongs to the sequence we are currently looking at
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ###############################################################
    ### STEP I Now processing the alignment stored in last_line ###
    ###############################################################
    ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
    ### we only continue to extract useful information about this alignment if 1 was returned
    my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
    next unless ($valid_alignment_found_1 == 1);

    _record_single_end_bowtie_alignment($index,\%mismatches);

    ##################################################################################################################################################
    ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or
    ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
    ### be returned as $valid_alignment_found and it will then be processed in the next round only.
    ##################################################################################################################################################
    my $newline = $fhs[$index]->{fh}-> getline();
    unless ($newline){
      # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
      $fhs[$index]->{last_seq_id} = undef;
      $fhs[$index]->{last_line} = undef;
      next;
    }
    my ($seq_id) = split (/\t/,$newline);
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line} = $newline;

    ### we only continue to extract useful information about this second alignment if 1 was returned
    my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
    next unless ($valid_alignment_found_2 == 1);

    ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
    ### with the same number of mismatches (or perfect matches), the entry is not stored a second time (see the helper below)
    _record_single_end_bowtie_alignment($index,\%mismatches);

    ####################################################################################################################################
    #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line}    ###
    ####################################################################################################################################
    $newline = $fhs[$index]->{fh}-> getline();
    if ($newline){
      my ($next_seq_id) = split (/\t/,$newline);
      die "The same seq ID occurred more than twice in a row\n" if ($next_seq_id eq $identifier);
      $fhs[$index]->{last_seq_id} = $next_seq_id;
      $fhs[$index]->{last_line} = $newline;
    }
    else {
      # assigning undef to last_seq_id and last_line (end of bowtie output)
      $fhs[$index]->{last_seq_id} = undef;
      $fhs[$index]->{last_line} = undef;
    }
  }

  ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless(%mismatches){
    $counting{no_single_alignment_found}++;
    return 1 if ($unmapped); ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
    return;
  }

  #######################################################################################################################################################
  ### We are now looking if there is a unique best alignment for a certain sequence. Only the alignments with the lowest number of mismatches are     ###
  ### considered. If there is only one single best position we are going to store the alignment information in the meth_call variables, if there are  ###
  ### multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether (but see the 3 alignment case below)        ###
  #######################################################################################################################################################
  ### Going to use the variable $sequence_fails as a memory if a sequence could not be aligned uniquely (set to 1 then)
  my $sequence_fails = 0;
  ### Declaring an empty hash reference which will store all information we need for the methylation call
  my $methylation_call_params; # hash reference!

  ### %mismatches is known to be non-empty here; sorting in ascending order and looking at the lowest number of mismatches only
  my ($mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my @best_locations = keys %{$mismatches{$mismatch_number}};

  ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
  if (scalar @best_locations == 1){
    my $unique_best_alignment = $mismatches{$mismatch_number}->{$best_locations[0]};
    $methylation_call_params->{$identifier}->{bowtie_sequence} = $unique_best_alignment->{bowtie_sequence};
    $methylation_call_params->{$identifier}->{chromosome} = $unique_best_alignment->{chromosome};
    $methylation_call_params->{$identifier}->{position} = $unique_best_alignment->{position};
    $methylation_call_params->{$identifier}->{index} = $unique_best_alignment->{index};
    $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
  }
  elsif (scalar @best_locations == 3){
    ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
    ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
    ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
    ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
    ### reaction. E.g.
    ### CAGTCACGCGCGCGCG will become
    ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
    ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
    ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
    ### G->A conversion:
    ### highly methylated: CAATCACACACACACA
    ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
    ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
    ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
    ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
    ### We do not want any bias, whatsoever. Therefore we look at the number of transliterations that were required to transform the actual sequence
    ### into the version that produced each alignment. In the above example this would be
    ### for the C->T version TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
    ### in the G->A case     TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
    ### As a threshold which does scale, the number of transliterations of the best candidate x 2 must still be smaller than the number of
    ### transliterations of the second best candidate. Everything else will be flagged with $sequence_fails = 1 and discarded.
    my @three_candidate_seqs;
    foreach my $composite_location (@best_locations){
      my $candidate = $mismatches{$mismatch_number}->{$composite_location};
      my $transliterations_performed;
      if ($candidate->{index} == 0 or $candidate->{index} == 1){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
      }
      elsif ($candidate->{index} == 2 or $candidate->{index} == 3){
        $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
      }
      else{
        die "unexpected index number range: $candidate->{index}\n";
      }
      push @three_candidate_seqs,{
        index =>$candidate->{index},
        bowtie_sequence => $candidate->{bowtie_sequence},
        mismatch_number => $mismatch_number,
        chromosome => $candidate->{chromosome},
        position => $candidate->{position},
        seq_id => $candidate->{seq_id},
        transliterations_performed => $transliterations_performed,
      };
    }
    ### sorting in ascending order for the lowest number of transliterations performed
    @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
    my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
    my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
    if (($first_array_element*2) < $second_array_element){
      $counting{low_complexity_alignments_overruled_count}++;
      ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
      $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
      $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
      $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
      $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
      $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
    }
    else{
      $sequence_fails = 1;
    }
  }
  else{
    $sequence_fails = 1;
  }

  ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
  if ($sequence_fails == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
    }
    if ($unmapped){
      return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
    }
    else{
      return 0; # => exits to next sequence (default)
    }
  }

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it will have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);

  ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});

  print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
  return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
}

### Private helper of check_bowtie_results_single_end: parses the alignment line currently stored in $fhs[$index]->{last_line}
### and stores it in the hash referenced by $mismatches, keyed by the number of mismatches and by "chromosome:position".
### An entry which already exists for the same number of mismatches and the same location is left untouched.
### Dies if the chromosome name does not end in _CT_converted/_GA_converted or if the mismatch field has an unexpected format.
sub _record_single_end_bowtie_alignment{
  my ($index,$mismatches) = @_;

  ### fields used from the bowtie output line: read ID, reference name, position, aligned sequence and the mismatch descriptors
  my ($id,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,2,3,4,7];

  unless($mismatch_info){
    $mismatch_info = '';
  }
  chomp $mismatch_info;

  ### need to extract the chromosome name by removing the conversion suffix from the reference name
  my $chromosome;
  if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
    $chromosome = $mapped_chromosome;
  }
  else{
    die "Chromosome number extraction failed for $mapped_chromosome\n";
  }

  ### Now extracting the number of mismatches to the converted genome (comma separated list, one element per mismatch)
  my $number_of_mismatches;
  if ($mismatch_info eq ''){
    $number_of_mismatches = 0;
  }
  elsif ($mismatch_info =~ /^\d/){
    my @mismatch_descriptors = split (/,/,$mismatch_info);
    $number_of_mismatches = scalar @mismatch_descriptors;
  }
  else{
    die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
  }

  ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
  my $alignment_location = join (":",$chromosome,$position);

  ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
  ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
  ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
  ### number for the found alignment)
  unless (exists $mismatches->{$number_of_mismatches}->{$alignment_location}){
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{index}=$index;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
    $mismatches->{$number_of_mismatches}->{$alignment_location}->{position}=$position;
  }
  return;
}
2696 | |
2697 sub check_bowtie_results_single_end_bowtie2{ | |
2698 my ($sequence,$identifier,$quality_value) = @_; | |
2699 | |
2700 | |
2701 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout | |
2702 $quality_value = 'I'x(length$sequence); | |
2703 } | |
2704 | |
2705 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. | |
2706 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs | |
2707 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; | |
2708 | |
2709 my $alignment_ambiguous = 0; | |
2710 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far | |
2711 my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. | |
2712 | |
2713 my %alignments = (); | |
2714 | |
2715 ### reading from the Bowtie 2 output filehandles | |
2716 foreach my $index (0..$#fhs){ | |
2717 # print "Index: $index\n"; | |
2718 # print "$fhs[$index]->{last_line}\n"; | |
2719 # print "$fhs[$index]->{last_seq_id}\n"; | |
2720 # sleep (1); | |
2721 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
2722 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); | |
2723 | |
2724 ### if the sequence we are currently looking at produced an alignment we are doing various things with it | |
2725 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; | |
2726 | |
2727 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
2728 # SAM format specifications for Bowtie 2 | |
2729 # (1) Name of read that aligned | |
2730 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
2731 # 1 The read is one of a pair | |
2732 # 2 The alignment is one end of a proper paired-end alignment | |
2733 # 4 The read has no reported alignments | |
2734 # 8 The read is one of a pair and has no reported alignments | |
2735 # 16 The alignment is to the reverse reference strand | |
2736 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
2737 # 64 The read is mate 1 in a pair | |
2738 # 128 The read is mate 2 in a pair | |
2739 # 256 The read has multiple mapping states | |
2740 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
2741 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
2742 # (5) Mapping quality (255 means MAPQ is not available) | |
2743 # (6) CIGAR string representation of alignment (* if unavailable) | |
2744 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
2745 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
2746 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
2747 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
2748 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
2749 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
2750 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
2751 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
2752 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
2753 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
2754 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
2755 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2756 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2757 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
2758 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
2759 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
2760 | |
2761 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; | |
2762 | |
2763 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance | |
2764 if ($flag == 4){ | |
2765 ## reading in the next alignment, which must be the next sequence | |
2766 my $newline = $fhs[$index]->{fh}-> getline(); | |
2767 if ($newline){ | |
2768 chomp $newline; | |
2769 my ($seq_id) = split (/\t/,$newline); | |
2770 $fhs[$index]->{last_seq_id} = $seq_id; | |
2771 $fhs[$index]->{last_line} = $newline; | |
2772 if ($seq_id eq $identifier){ | |
2773 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
2774 } | |
2775 next; # next instance | |
2776 } | |
2777 else{ | |
2778 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2779 $fhs[$index]->{last_seq_id} = undef; | |
2780 $fhs[$index]->{last_line} = undef; | |
2781 next; | |
2782 } | |
2783 } | |
2784 | |
2785 # if there are one or more proper alignments we can extract the chromosome number | |
2786 my $chromosome; | |
2787 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
2788 $chromosome = $mapped_chromosome; | |
2789 } | |
2790 else{ | |
2791 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
2792 } | |
2793 | |
2794 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
2795 my ($alignment_score,$second_best,$MD_tag); | |
2796 my @fields = split (/\t/,$fhs[$index]->{last_line}); | |
2797 | |
2798 foreach (11..$#fields){ | |
2799 if ($fields[$_] =~ /AS:i:(.*)/){ | |
2800 $alignment_score = $1; | |
2801 } | |
2802 elsif ($fields[$_] =~ /XS:i:(.*)/){ | |
2803 $second_best = $1; | |
2804 } | |
2805 elsif ($fields[$_] =~ /MD:Z:(.*)/){ | |
2806 $MD_tag = $1; | |
2807 } | |
2808 } | |
2809 | |
2810 if (!defined $best_AS_so_far){ | |
2811 $best_AS_so_far = $alignment_score; | |
2812 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; | |
2813 } | |
2814 else{ | |
2815 if ($alignment_score > $best_AS_so_far){ # AS are generally negative with a maximum of 0 | |
2816 $best_AS_so_far = $alignment_score; | |
2817 # warn "Found better alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n"; | |
2818 # resetting the ambiguous within thread memory (if applicable at all) | |
2819 # warn "Resetting amb within thread value to 0\n"; | |
2820 $amb_same_thread = 0; | |
2821 } | |
2822 else{ | |
2823 # warn "current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; | |
2824 } | |
2825 } | |
2826 | |
2827 # warn "First best alignment_score is: '$alignment_score'\n"; | |
2828 # warn "MD tag is: '$MD_tag'\n"; | |
2829 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag); | |
2830 | |
2831 if (defined $second_best){ | |
2832 # warn "second best alignment_score is: '$second_best'\n\n"; | |
2833 | |
2834 # If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this | |
2835 if ($alignment_score == $second_best){ | |
2836 | |
2837 # checking to see if this read produced the best alignment | |
2838 if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous | |
2839 # warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; | |
2840 $amb_same_thread = 1; | |
2841 } | |
2842 else{ | |
2843 # warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; | |
2844 } | |
2845 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether | |
2846 | |
2847 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
2848 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
2849 my $newline = $fhs[$index]->{fh}-> getline(); | |
2850 if ($newline){ | |
2851 chomp $newline; | |
2852 my ($seq_id) = split (/\t/,$newline); | |
2853 $fhs[$index]->{last_seq_id} = $seq_id; | |
2854 $fhs[$index]->{last_line} = $newline; | |
2855 } | |
2856 else{ | |
2857 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2858 $fhs[$index]->{last_seq_id} = undef; | |
2859 $fhs[$index]->{last_line} = undef; | |
2860 last; # break free in case we have reached the end of the alignment output | |
2861 } | |
2862 } | |
2863 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
2864 } | |
2865 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
2866 | |
2867 my $alignment_location = join (":",$chromosome,$position); | |
2868 | |
2869 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
2870 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
2871 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
2872 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
2873 | |
2874 unless (exists $alignments{$alignment_location}){ | |
2875 $alignments{$alignment_location}->{seq_id} = $id; | |
2876 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
2877 $alignments{$alignment_location}->{alignment_score_second_best} = $second_best; | |
2878 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
2879 $alignments{$alignment_location}->{index} = $index; | |
2880 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
2881 $alignments{$alignment_location}->{position} = $position; | |
2882 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
2883 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
2884 } | |
2885 | |
2886 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence | |
2887 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
2888 my $newline = $fhs[$index]->{fh}-> getline(); | |
2889 if ($newline){ | |
2890 chomp $newline; | |
2891 my ($seq_id) = split (/\t/,$newline); | |
2892 $fhs[$index]->{last_seq_id} = $seq_id; | |
2893 $fhs[$index]->{last_line} = $newline; | |
2894 } | |
2895 else{ | |
2896 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2897 $fhs[$index]->{last_seq_id} = undef; | |
2898 $fhs[$index]->{last_line} = undef; | |
2899 last; # break free in case we have reached the end of the alignment output | |
2900 } | |
2901 } | |
2902 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
2903 } | |
2904 } | |
2905 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
2906 | |
2907 my $alignment_location = join (":",$chromosome,$position); | |
2908 | |
2909 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
2910 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
2911 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
2912 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
2913 | |
2914 unless (exists $alignments{$alignment_location}){ | |
2915 $alignments{$alignment_location}->{seq_id} = $id; | |
2916 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
2917 $alignments{$alignment_location}->{alignment_score_second_best} = undef; | |
2918 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
2919 $alignments{$alignment_location}->{index} = $index; | |
2920 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
2921 $alignments{$alignment_location}->{position} = $position; | |
2922 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
2923 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
2924 } | |
2925 | |
2926 my $newline = $fhs[$index]->{fh}-> getline(); | |
2927 if ($newline){ | |
2928 chomp $newline; | |
2929 my ($seq_id) = split (/\t/,$newline); | |
2930 $fhs[$index]->{last_seq_id} = $seq_id; | |
2931 $fhs[$index]->{last_line} = $newline; | |
2932 if ($seq_id eq $identifier){ | |
2933 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
2934 } | |
2935 } | |
2936 else{ | |
2937 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2938 $fhs[$index]->{last_seq_id} = undef; | |
2939 $fhs[$index]->{last_line} = undef; | |
2940 } | |
2941 } | |
2942 } | |
2943 } | |
2944 | |
2945 ### If there were several equally good alignments for the best alignment score we will boot the read | |
2946 if ($amb_same_thread){ | |
2947 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
2948 $alignment_ambiguous = 1; | |
2949 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
2950 } | |
2951 else{ | |
2952 # warn "alignment won't be considered ambiguous. This time....\n"; | |
2953 } | |
2954 | |
2955 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out. | |
2956 if ($alignment_ambiguous == 1){ | |
2957 $counting{unsuitable_sequence_count}++; | |
2958 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
2959 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
2960 # print "$ambiguous_read_output\n"; | |
2961 | |
2962 if ($ambiguous){ | |
2963 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified | |
2964 } | |
2965 elsif ($unmapped){ | |
2966 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
2967 } | |
2968 else{ | |
2969 return 0; | |
2970 } | |
2971 } | |
2972 | |
2973 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file | |
2974 unless(%alignments){ | |
2975 $counting{no_single_alignment_found}++; | |
2976 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); | |
2977 # print "$unmapped_read_output\n"; | |
2978 if ($unmapped){ | |
2979 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified | |
2980 } | |
2981 else{ | |
2982 return 0; # default | |
2983 } | |
2984 } | |
2985 | |
2986 ####################################################################################################################################################### | |
2987 | |
2988 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
2989 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
2990 ### alignment score we are discarding the sequence altogether. | |
2991 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for | |
2992 ### opening (5) and extending (3 per bp) the gap. | |
2993 | |
2994 ####################################################################################################################################################### | |
2995 | |
2996 my $methylation_call_params; # hash reference which will store all information we need for the methylation call | |
2997 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
2998 | |
2999 ### print contents of %alignments for debugging | |
3000 # if (scalar keys %alignments > 1){ | |
3001 # print "\n******\n"; | |
3002 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
3003 # print "Loc: $alignment_location\n"; | |
3004 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
3005 # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; | |
3006 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; | |
3007 # print "Index $alignments{$alignment_location}->{index}\n"; | |
3008 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
3009 # print "pos: $alignments{$alignment_location}->{position}\n"; | |
3010 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; | |
3011 # } | |
3012 # print "\n******\n"; | |
3013 # } | |
3014 | |
3015 ### if there is only 1 entry in the hash we accept it as the best alignment | |
3016 if (scalar keys %alignments == 1){ | |
3017 for my $unique_best_alignment (keys %alignments){ | |
3018 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; | |
3019 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
3020 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; | |
3021 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
3022 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; | |
3023 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best}; | |
3024 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; | |
3025 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; | |
3026 } | |
3027 } | |
3028 | |
3029 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
3030 ### we boot the sequence altogether | |
3031 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
3032 my $best_alignment_score; | |
3033 my $best_alignment_location; | |
3034 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ | |
3035 # print "$alignments{$alignment_location}->{alignment_score}\n"; | |
3036 unless (defined $best_alignment_score){ | |
3037 $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; | |
3038 $best_alignment_location = $alignment_location; | |
3039 # print "setting best alignment score: $best_alignment_score\n"; | |
3040 } | |
3041 else{ | |
3042 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted | |
3043 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ | |
3044 # warn "Same alignment score, the sequence will get booted!\n"; | |
3045 $sequence_fails = 1; | |
3046 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments | |
3047 } | |
3048 ### else we are going to store the best alignment for further processing | |
3049 else{ | |
3050 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; | |
3051 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
3052 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; | |
3053 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
3054 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; | |
3055 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; | |
3056 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; | |
3057 if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) { | |
3058 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best}; | |
3059 } | |
3060 else { | |
3061 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score}; | |
3062 } | |
3063 last; # exiting after processing the second alignment since the sequence produced a unique best alignment | |
3064 } | |
3065 } | |
3066 } | |
3067 } | |
3068 else{ | |
3069 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; | |
3070 } | |
3071 | |
3072 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions | |
3073 if ($sequence_fails == 1){ | |
3074 $counting{unsuitable_sequence_count}++; | |
3075 | |
3076 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
3077 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
3078 # print OUT "$ambiguous_read_output\n"; | |
3079 | |
3080 if ($ambiguous){ | |
3081 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified | |
3082 } | |
3083 elsif ($unmapped){ | |
3084 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
3085 } | |
3086 else{ | |
3087 return 0; # => exits to next sequence (default) | |
3088 } | |
3089 } | |
3090 | |
3091 ### --DIRECTIONAL | |
3092 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
3093 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
3094 if ($directional){ | |
3095 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ | |
3096 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
3097 $counting{alignments_rejected_count}++; | |
3098 return 0; | |
3099 } | |
3100 } | |
3101 | |
3102 ### If the sequence has not been rejected so far it has a unique best alignment | |
3103 $counting{unique_best_alignment_count}++; | |
3104 | |
3105 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well | |
3106 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); | |
3107 | |
3108 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call | |
3109 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ | |
3110 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; | |
3111 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3112 return 0; | |
3113 } | |
3114 | |
3115 # Compute MAPQ value | |
3116 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef, | |
3117 $methylation_call_params->{$identifier}->{alignment_score}, | |
3118 $methylation_call_params->{$identifier}->{alignment_score_second_best}); | |
3119 | |
3120 | |
3121 | |
3122 ### otherwise we are set to perform the actual methylation call | |
3123 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); | |
3124 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); | |
3125 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out | |
3126 } | |
3127 | |
3128 | |
### Counts the characters of a sequence that a C->T or G->A transliteration would change.
### Arguments: $sequence        - the sequence to inspect (only upper-case 'C' or 'G' characters are counted)
###            $read_conversion - 'CT' (count the Cs) or 'GA' (count the Gs)
### Returns the number of matching characters (0 if there are none).
### Dies if $read_conversion is anything other than 'CT' or 'GA' (including undef).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  ### a missing conversion mode is handled like an unknown one, instead of triggering
  ### 'uninitialized value' warnings in the string comparisons below
  $read_conversion = '' unless (defined $read_conversion);

  ### tr/// with an empty replacement list merely counts the matching characters, so the
  ### sequence does not have to be rewritten just to find out how many there are
  if ($read_conversion eq 'CT'){
    return ($sequence =~ tr/C//);
  }
  elsif ($read_conversion eq 'GA'){
    return ($sequence =~ tr/G//);
  }

  ### $! is deliberately not part of the message: no system call has failed at this point,
  ### so errno would only append unrelated (stale) error text
  die "Read conversion mode of the read was not specified (expected 'CT' or 'GA', got '$read_conversion')\n";
}
3143 | |
### Decides whether the alignment currently buffered for filehandle $index (Bowtie 1 format) belongs
### to read $identifier and is in a sensible orientation.
### Arguments: $index      - position of the alignment instance in @fhs
###            $identifier - ID of the read that is currently being processed
### Returns 1 if $fhs[$index]->{last_line} holds a usable alignment for $identifier, otherwise 0.
### At most two further lines are read from the filehandle; whichever line was read last and not
### used ends up in last_seq_id/last_line (both are set to undef at the end of the output).
### Dies if the orientation check returns something other than 0 or 1, or if the same ID shows up
### a third time in a row.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### small helper to remember (or, when called with undef values, to clear) the look-ahead entry
  my $store_lookahead = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
  };

  # extracting from Bowtie 1 format
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the buffered entry already belongs to a later read -> it will be analysed in a later round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### first alignment of this read
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);   ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the first alignment was in the wrong orientation, so we need to look at the following line
  my $second_line = $fhs[$index]->{fh}->getline();
  unless ($second_line){
    $store_lookahead->(undef,undef);   # end of bowtie output
    return 0;
  }

  ($id,$strand) = (split (/\t/,$second_line))[0,1];
  unless ($id eq $identifier){
    ### this line is already the next sequence to be analysed -> keep it for the next round
    $store_lookahead->($id,$second_line);
    return 0;
  }

  ### second alignment of this read
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $store_lookahead->($id,$second_line);
    return 1;                          ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### both alignments were in the wrong orientation; buffer the line after them and give up on this read
  my $third_line = $fhs[$index]->{fh}->getline();
  unless ($third_line){
    $store_lookahead->(undef,undef);   # end of bowtie output
    return 0;
  }

  my ($seq_id) = split (/\t/,$third_line);
  ### a third entry with the same ID must not happen
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);
  $store_lookahead->($seq_id,$third_line);
  return 0;
}
3221 ######################### | |
3222 ### BOWTIE 1 | PAIRED-END | |
3223 ######################### | |
3224 | |
3225 sub check_bowtie_results_paired_ends{ | |
3226 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; | |
3227 | |
3228 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 | |
3229 unless ($quality_value_1){ | |
3230 $quality_value_1 = 'I'x(length$sequence_1); | |
3231 } | |
3232 unless ($quality_value_2){ | |
3233 $quality_value_2 = 'I'x(length$sequence_2); | |
3234 } | |
3235 | |
3236 # warn "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; | |
3237 # sleep (1); | |
3238 my %mismatches = (); | |
3239 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome | |
3240 | |
3241 | |
3242 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way. | |
3243 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). | |
3244 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) | |
3245 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary | |
3246 ### strands are not being reported by specifying --directional | |
3247 | |
3248 foreach my $index (0,3,1,2){ | |
3249 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
3250 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); | |
3251 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it | |
3252 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
3253 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n"; | |
3254 | |
3255 ################################################################################## | |
3256 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ### | |
3257 ################################################################################## | |
3258 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); | |
3259 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 aligments both in the wrong | |
3260 ### orientation. We only continue to extract useful information about this alignment if 1 was returned | |
3261 if ($valid_alignment_found == 1){ | |
3262 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself. | |
3263 ### we store the useful information in %mismatches | |
3264 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7]; | |
3265 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7]; | |
3266 chomp $mismatch_info_1; | |
3267 chomp $mismatch_info_2; | |
3268 | |
3269 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted | |
3270 my ($chromosome_1,$chromosome_2); | |
3271 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
3272 $chromosome_1 = $mapped_chromosome_1; | |
3273 } | |
3274 else{ | |
3275 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
3276 } | |
3277 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
3278 $chromosome_2 = $mapped_chromosome_2; | |
3279 } | |
3280 else{ | |
3281 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
3282 } | |
3283 | |
3284 ### Now extracting the number of mismatches to the converted genome | |
3285 my $number_of_mismatches_1; | |
3286 my $number_of_mismatches_2; | |
3287 if ($mismatch_info_1 eq ''){ | |
3288 $number_of_mismatches_1 = 0; | |
3289 } | |
3290 elsif ($mismatch_info_1 =~ /^\d/){ | |
3291 my @mismatches = split (/,/,$mismatch_info_1); | |
3292 $number_of_mismatches_1 = scalar @mismatches; | |
3293 } | |
3294 else{ | |
3295 die "Something weird is going on with the mismatch field\n"; | |
3296 } | |
3297 if ($mismatch_info_2 eq ''){ | |
3298 $number_of_mismatches_2 = 0; | |
3299 } | |
3300 elsif ($mismatch_info_2 =~ /^\d/){ | |
3301 my @mismatches = split (/,/,$mismatch_info_2); | |
3302 $number_of_mismatches_2 = scalar @mismatches; | |
3303 } | |
3304 else{ | |
3305 die "Something weird is going on with the mismatch field\n"; | |
3306 } | |
3307 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments | |
3308 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; | |
3309 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
3310 die "Position 1 is higher than position 2" if ($position_1 > $position_2); | |
3311 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
3312 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
3313 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3314 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same | |
3315 ### location (the genomic sequence extraction and methylation would not be affected by this; the only thing which would change is the index | |
3316 ### number for the found alignment) | |
3317 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ | |
3318 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine | |
3319 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; | |
3320 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; | |
3321 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; | |
3322 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine | |
3323 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; | |
3324 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; | |
3325 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; | |
3326 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; | |
3327 } | |
3328 ################################################################################################################################################### | |
3329 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ### | |
3330 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2. ### | |
3331 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ### | |
3332 ### this round ### | |
3333 ################################################################################################################################################### | |
3334 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3335 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3336 | |
3337 if ($newline_1 and $newline_2){ | |
3338 my ($seq_id_1) = split (/\t/,$newline_1); | |
3339 my ($seq_id_2) = split (/\t/,$newline_2); | |
3340 | |
3341 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag | |
3342 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3343 } | |
3344 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag | |
3345 $fhs[$index]->{last_seq_id} = $seq_id_2; | |
3346 } | |
3347 else{ | |
3348 die "Either read 1 or read 2 needs to end on '/1'\n"; | |
3349 } | |
3350 | |
3351 $fhs[$index]->{last_line_1} = $newline_1; | |
3352 $fhs[$index]->{last_line_2} = $newline_2; | |
3353 } | |
3354 else { | |
3355 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) | |
3356 $fhs[$index]->{last_seq_id} = undef; | |
3357 $fhs[$index]->{last_line_1} = undef; | |
3358 $fhs[$index]->{last_line_2} = undef; | |
3359 next; # jumping to the next index | |
3360 } | |
3361 ### Now processing the entry we just stored in last_line_1 and last_line_2 | |
3362 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier); | |
3363 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to | |
3364 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation | |
3365 if ($valid_alignment_found == 1){ | |
3366 ### we store the useful information in %mismatches | |
3367 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7]; | |
3368 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7]; | |
3369 chomp $mismatch_info_1; | |
3370 chomp $mismatch_info_2; | |
3371 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted) | |
3372 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
3373 $chromosome_1 = $mapped_chromosome_1; | |
3374 } | |
3375 else{ | |
3376 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
3377 } | |
3378 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
3379 $chromosome_2 = $mapped_chromosome_2; | |
3380 } | |
3381 else{ | |
3382 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
3383 } | |
3384 | |
3385 $number_of_mismatches_1=''; | |
3386 $number_of_mismatches_2=''; | |
3387 ### Now extracting the number of mismatches to the converted genome | |
3388 if ($mismatch_info_1 eq ''){ | |
3389 $number_of_mismatches_1 = 0; | |
3390 } | |
3391 elsif ($mismatch_info_1 =~ /^\d/){ | |
3392 my @mismatches = split (/,/,$mismatch_info_1); | |
3393 $number_of_mismatches_1 = scalar @mismatches; | |
3394 } | |
3395 else{ | |
3396 die "Something weird is going on with the mismatch field\n"; | |
3397 } | |
3398 if ($mismatch_info_2 eq ''){ | |
3399 $number_of_mismatches_2 = 0; | |
3400 } | |
3401 elsif ($mismatch_info_2 =~ /^\d/){ | |
3402 my @mismatches = split (/,/,$mismatch_info_2); | |
3403 $number_of_mismatches_2 = scalar @mismatches; | |
3404 } | |
3405 else{ | |
3406 die "Something weird is going on with the mismatch field\n"; | |
3407 } | |
3408 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments | |
3409 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2; | |
3410 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
3411 die "position 1 is greater than position 2" if ($position_1 > $position_2); | |
3412 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
3413 $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
3414 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3415 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same | |
3416 ### location (the genomic sequence extraction and methylation would not be affected by this; the only thing which would change is the index | |
3417 ### number for the found alignment) | |
3418 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){ | |
3419 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine | |
3420 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1; | |
3421 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2; | |
3422 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index; | |
3423 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine | |
3424 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1; | |
3425 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2; | |
3426 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1; | |
3427 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2; | |
3428 } | |
3429 ############################################################################################################################################### | |
3430 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2 ### | |
3431 ############################################################################################################################################### | |
3432 $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3433 $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3434 | |
3435 if ($newline_1 and $newline_2){ | |
3436 my ($seq_id_1) = split (/\t/,$newline_1); | |
3437 my ($seq_id_2) = split (/\t/,$newline_2); | |
3438 | |
3439 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag | |
3440 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3441 } | |
3442 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag | |
3443 $fhs[$index]->{last_seq_id} = $seq_id_2; | |
3444 } | |
3445 $fhs[$index]->{last_line_1} = $newline_1; | |
3446 $fhs[$index]->{last_line_2} = $newline_2; | |
3447 } | |
3448 else { | |
3449 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output) | |
3450 $fhs[$index]->{last_seq_id} = undef; | |
3451 $fhs[$index]->{last_line_1} = undef; | |
3452 $fhs[$index]->{last_line_2} = undef; | |
3453 next; # jumping to the next index | |
3454 } | |
3455 ### within the 2nd sequence pair alignment in correct orientation found | |
3456 } | |
3457 ### within the 1st sequence pair alignment in correct orientation found | |
3458 } | |
3459 ### still within the (last_seq_id eq identifier) condition | |
3460 } | |
3461 ### still within foreach index loop | |
3462 } | |
3463 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file | |
3464 unless(%mismatches){ | |
3465 $counting{no_single_alignment_found}++; | |
3466 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified | |
3467 } | |
3468 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
3469 my $sequence_pair_fails = 0; | |
3470 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
3471 my $methylation_call_params; # hash reference! | |
3472 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the | |
3473 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the | |
3474 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether | |
3475 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ | |
3476 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n"; | |
3477 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){ | |
3478 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n"; | |
3479 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n"; | |
3480 } | |
3481 if (scalar keys %{$mismatches{$mismatch_number}} == 1){ | |
3482 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n"; | |
3483 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ | |
3484 $methylation_call_params->{$identifier}->{seq_id} = $identifier; | |
3485 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1}; | |
3486 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}; | |
3487 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; | |
3488 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1}; | |
3489 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}; | |
3490 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2})); | |
3491 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; | |
3492 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1}; | |
3493 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2}; | |
3494 } | |
3495 } | |
3496 else{ | |
3497 $sequence_pair_fails = 1; | |
3498 } | |
3499 ### after processing the alignment with the lowest number of mismatches we exit | |
3500 last; | |
3501 } | |
3502 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions | |
3503 if ($sequence_pair_fails == 1){ | |
3504 $counting{unsuitable_sequence_count}++; | |
3505 if ($ambiguous){ | |
3506 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified | |
3507 } | |
3508 if ($unmapped){ | |
3509 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified | |
3510 } | |
3511 else{ | |
3512 return 0; # => exits to next sequence (default) | |
3513 } | |
3514 } | |
3515 | |
3516 ### --DIRECTIONAL | |
3517 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
3518 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
3519 if ($directional){ | |
3520 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ | |
3521 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
3522 $counting{alignments_rejected_count}++; | |
3523 return 0; | |
3524 } | |
3525 } | |
3526 | |
3527 ### If the sequence has not been rejected so far it does have a unique best alignment | |
3528 $counting{unique_best_alignment_count}++; | |
3529 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params); | |
3530 | |
3531 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call | |
3532 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | |
3533 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n"; | |
3534 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3535 return 0; | |
3536 } | |
3537 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | |
3538 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n"; | |
3539 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3540 return 0; | |
3541 } | |
3542 | |
3543 ### otherwise we are set to perform the actual methylation call | |
3544 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); | |
3545 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); | |
3546 | |
3547 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); | |
3548 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 | |
3549 } | |
3550 | |
3551 ######################### | |
3552 ### BOWTIE 2 | PAIRED-END | |
3553 ######################### | |
3554 | |
3555 sub check_bowtie_results_paired_ends_bowtie2{ | |
3556 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; | |
3557 | |
3558 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 | |
3559 unless ($quality_value_1){ | |
3560 $quality_value_1 = 'I'x(length$sequence_1); | |
3561 } | |
3562 | |
3563 unless ($quality_value_2){ | |
3564 $quality_value_2 = 'I'x(length$sequence_2); | |
3565 } | |
3566 | |
3567 | |
3568 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; | |
3569 | |
3570 | |
3571 my %alignments; | |
3572 my $alignment_ambiguous = 0; | |
3573 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far | |
3574 my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. | |
3575 | |
3576 ### reading from the Bowtie 2 output filehandles | |
3577 | |
3578 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way. | |
3579 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). | |
3580 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) | |
3581 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary | |
3582 ### strands are not being reported when '--directional' is specified | |
3583 | |
3584 foreach my $index (0,3,1,2){ | |
3585 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
3586 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); | |
3587 | |
3588 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it | |
3589 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
3590 | |
3591 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; | |
3592 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; | |
3593 # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; | |
3594 # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; | |
3595 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; | |
3596 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; | |
3597 $id_1 =~ s/\/1$//; | |
3598 $id_2 =~ s/\/2$//; | |
3599 | |
3600 # SAM format specifications for Bowtie 2 | |
3601 # (1) Name of read that aligned | |
3602 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
3603 # 1 The read is one of a pair | |
3604 # 2 The alignment is one end of a proper paired-end alignment | |
3605 # 4 The read has no reported alignments | |
3606 # 8 The read is one of a pair and has no reported alignments | |
3607 # 16 The alignment is to the reverse reference strand | |
3608 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
3609 # 64 The read is mate 1 in a pair | |
3610 # 128 The read is mate 2 in a pair | |
3611 # 256 The read has multiple mapping states | |
3612 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
3613 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
3614 # (5) Mapping quality (255 means MAPQ is not available) | |
3615 # (6) CIGAR string representation of alignment (* if unavailable) | |
3616 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
3617 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
3618 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
3619 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
3620 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
3621 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
3622 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
3623 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
3624 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
3625 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
3626 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
3627 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
3628 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
3629 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
3630 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
3631 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
3632 | |
3633 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). | |
3634 ### We can store the next alignment and move on to the next Bowtie 2 instance | |
3635 if ($flag_1 == 77 and $flag_2 == 141){ | |
3636 ## reading in the next alignment, which must be the next sequence | |
3637 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3638 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3639 | |
3640 if ($newline_1 and $newline_2){ | |
3641 chomp $newline_1; | |
3642 chomp $newline_2; | |
3643 my ($seq_id_1) = split (/\t/,$newline_1); | |
3644 my ($seq_id_2) = split (/\t/,$newline_2); | |
3645 $seq_id_1 =~ s/\/1$//; | |
3646 $seq_id_2 =~ s/\/2$//; | |
3647 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3648 $fhs[$index]->{last_line_1} = $newline_1; | |
3649 $fhs[$index]->{last_line_2} = $newline_2; | |
3650 | |
3651 # print "current sequence ($identifier) did not map, reading in next sequence\n"; | |
3652 # print "$index\t$fhs[$index]->{last_seq_id}\n"; | |
3653 # print "$index\t$fhs[$index]->{last_line_1}\n"; | |
3654 # print "$index\t$fhs[$index]->{last_line_2}\n"; | |
3655 next; # next instance | |
3656 } | |
3657 else{ | |
3658 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3659 $fhs[$index]->{last_seq_id} = undef; | |
3660 $fhs[$index]->{last_line_1} = undef; | |
3661 $fhs[$index]->{last_line_2} = undef; | |
3662 next; | |
3663 } | |
3664 } | |
3665 | |
3666 ### If there are one or more proper alignments we can extract the chromosome number | |
3667 my ($chromosome_1,$chromosome_2); | |
3668 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
3669 $chromosome_1 = $mapped_chromosome_1; | |
3670 } | |
3671 else{ | |
3672 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
3673 } | |
3674 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
3675 $chromosome_2 = $mapped_chromosome_2; | |
3676 } | |
3677 else{ | |
3678 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
3679 } | |
3680 | |
3681 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
3682 | |
3683 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
3684 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); | |
3685 | |
3686 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); | |
3687 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); | |
3688 | |
3689 foreach (11..$#fields_1){ | |
3690 if ($fields_1[$_] =~ /AS:i:(.*)/){ | |
3691 $alignment_score_1 = $1; | |
3692 } | |
3693 elsif ($fields_1[$_] =~ /XS:i:(.*)/){ | |
3694 $second_best_1 = $1; | |
3695 } | |
3696 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ | |
3697 $MD_tag_1 = $1; | |
3698 } | |
3699 } | |
3700 | |
3701 foreach (11..$#fields_2){ | |
3702 if ($fields_2[$_] =~ /AS:i:(.*)/){ | |
3703 $alignment_score_2 = $1; | |
3704 } | |
3705 elsif ($fields_2[$_] =~ /XS:i:(.*)/){ | |
3706 $second_best_2 = $1; | |
3707 } | |
3708 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ | |
3709 $MD_tag_2 = $1; | |
3710 } | |
3711 } | |
3712 | |
3713 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); | |
3714 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); | |
3715 | |
3716 # warn "First read 1 alignment score is: '$alignment_score_1'\n"; | |
3717 # warn "First read 2 alignment score is: '$alignment_score_2'\n"; | |
3718 # warn "MD tag 1 is: '$MD_tag_1'\n"; | |
3719 # warn "MD tag 2 is: '$MD_tag_2'\n"; | |
3720 | |
3721 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments | |
3722 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; | |
3723 # warn "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; | |
3724 | |
3725 if (!defined $best_AS_so_far){ | |
3726 $best_AS_so_far = $sum_of_alignment_scores_1; | |
3727 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3728 } | |
3729 else{ | |
3730 if ($sum_of_alignment_scores_1 > $best_AS_so_far){ # AS are generally negative with a maximum of 0 | |
3731 $best_AS_so_far = $sum_of_alignment_scores_1; | |
3732 # warn "Found better sum of alignment scores ($sum_of_alignment_scores), setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3733 # resetting the ambiguous within thread memory (if applicable at all) | |
3734 # warn "Resetting amb within thread value to 0\n"; | |
3735 $amb_same_thread = 0; | |
3736 } | |
3737 else{ | |
3738 # warn "current alignment (AS $sum_of_alignment_scores) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; | |
3739 } | |
3740 } | |
3741 | |
3742 if (defined $second_best_1 and defined $second_best_2){ | |
3743 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; | |
3744 # warn "Second best alignment_score_1 is: '$second_best_1'\n"; | |
3745 # warn "Second best alignment_score_2 is: '$second_best_2'\n"; | |
3746 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; | |
3747 | |
3748 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we keep a memory of this | |
3749 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ | |
3750 | |
3751 # checking to see if this read pair produced the best alignment | |
3752 if ($sum_of_alignment_scores_1 == $best_AS_so_far){ # yes this is the best read pair so far, either within the thread or between threads, however it is ambiguous | |
3753 # warn "Read pair is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; | |
3754 $amb_same_thread = 1; | |
3755 } | |
3756 else{ | |
3757 # warn "This read pair has a worse alignment score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; | |
3758 } | |
3759 | |
3760 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether one way or another | |
3761 | |
3762 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
3763 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
3764 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3765 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3766 if ($newline_1 and $newline_2){ | |
3767 chomp $newline_1; | |
3768 chomp $newline_2; | |
3769 my ($seq_id_1) = split (/\t/,$newline_1); | |
3770 my ($seq_id_2) = split (/\t/,$newline_2); | |
3771 $seq_id_1 =~ s/\/1$//; | |
3772 $seq_id_2 =~ s/\/2$//; | |
3773 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
3774 | |
3775 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3776 $fhs[$index]->{last_line_1} = $newline_1; | |
3777 $fhs[$index]->{last_line_2} = $newline_2; | |
3778 } | |
3779 else{ | |
3780 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3781 $fhs[$index]->{last_seq_id} = undef; | |
3782 $fhs[$index]->{last_line_1} = undef; | |
3783 $fhs[$index]->{last_line_2} = undef; | |
3784 last; # break free if the end of the alignment output was reached | |
3785 } | |
3786 } | |
3787 # if ($fhs[$index]->{last_seq_id}){ | |
3788 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
3789 # } | |
3790 } | |
3791 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
3792 | |
3793 my $alignment_location; | |
3794 if ($position_1 <= $position_2){ | |
3795 $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
3796 } | |
3797 elsif($position_2 < $position_1){ | |
3798 $alignment_location = join(":",$chromosome_1,$position_2,$position_1); | |
3799 } | |
3800 | |
3801 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3802 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
3803 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
3804 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
3805 | |
3806 unless (exists $alignments{$alignment_location}){ | |
3807 $alignments{$alignment_location}->{seq_id} = $id_1; | |
3808 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
3809 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
3810 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
3811 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = $sum_of_alignment_scores_second_best; | |
3812 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
3813 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
3814 $alignments{$alignment_location}->{index} = $index; | |
3815 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
3816 $alignments{$alignment_location}->{position_1} = $position_1; | |
3817 $alignments{$alignment_location}->{position_2} = $position_2; | |
3818 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
3819 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
3820 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
3821 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
3822 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
3823 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
3824 } | |
3825 # warn "added best of several alignments to \%alignments hash\n"; | |
3826 | |
3827 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence | |
3828 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
3829 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3830 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3831 if ($newline_1 and $newline_2){ | |
3832 chomp $newline_1; | |
3833 chomp $newline_2; | |
3834 my ($seq_id_1) = split (/\t/,$newline_1); | |
3835 my ($seq_id_2) = split (/\t/,$newline_2); | |
3836 $seq_id_1 =~ s/\/1$//; | |
3837 $seq_id_2 =~ s/\/2$//; | |
3838 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
3839 | |
3840 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3841 $fhs[$index]->{last_line_1} = $newline_1; | |
3842 $fhs[$index]->{last_line_2} = $newline_2; | |
3843 } | |
3844 else{ | |
3845 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
3846 $fhs[$index]->{last_seq_id} = undef; | |
3847 $fhs[$index]->{last_line_1} = undef; | |
3848 $fhs[$index]->{last_line_2} = undef; | |
3849 last; # break free if the end of the alignment output was reached | |
3850 } | |
3851 } | |
3852 # if($fhs[$index]->{last_seq_id}){ | |
3853 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; | |
3854 # } | |
3855 } | |
3856 } | |
3857 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
3858 | |
3859 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
3860 # print "$alignment_location\n"; | |
3861 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3862 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
3863 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
3864 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
3865 | |
3866 unless (exists $alignments{$alignment_location}){ | |
3867 $alignments{$alignment_location}->{seq_id} = $id_1; | |
3868 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
3869 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
3870 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
3871 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = undef; | |
3872 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
3873 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
3874 $alignments{$alignment_location}->{index} = $index; | |
3875 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
3876 $alignments{$alignment_location}->{position_1} = $position_1; | |
3877 $alignments{$alignment_location}->{position_2} = $position_2; | |
3878 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
3879 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
3880 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
3881 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
3882 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
3883 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
3884 } | |
3885 | |
3886 # warn "added unique alignment to \%alignments hash\n"; | |
3887 | |
3888 # Now reading and storing the next read pair | |
3889 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3890 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3891 if ($newline_1 and $newline_2){ | |
3892 chomp $newline_1; | |
3893 chomp $newline_2; | |
3894 # print "$newline_1\n"; | |
3895 # print "$newline_2\n"; | |
3896 my ($seq_id_1) = split (/\t/,$newline_1); | |
3897 my ($seq_id_2) = split (/\t/,$newline_2); | |
3898 $seq_id_1 =~ s/\/1$//; | |
3899 $seq_id_2 =~ s/\/2$//; | |
3900 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
3901 | |
3902 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3903 $fhs[$index]->{last_line_1} = $newline_1; | |
3904 $fhs[$index]->{last_line_2} = $newline_2; | |
3905 | |
3906 if ($seq_id_1 eq $identifier){ | |
3907 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
3908 } | |
3909 } | |
3910 else{ | |
3911 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
3912 $fhs[$index]->{last_seq_id} = undef; | |
3913 $fhs[$index]->{last_line_1} = undef; | |
3914 $fhs[$index]->{last_line_2} = undef; | |
3915 } | |
3916 } | |
3917 } | |
3918 } | |
3919 | |
3920 ### If there were several equally good alignments for the best alignment score we will boot the read | |
3921 if ($amb_same_thread){ | |
3922 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
3923 $alignment_ambiguous = 1; | |
3924 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
3925 } | |
3926 else{ | |
3927 # warn "alignment won't be considered ambiguous. This time....\n"; | |
3928 } | |
3929 | |
3930 | |
3931 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format | |
3932 if ($alignment_ambiguous == 1){ | |
3933 $counting{unsuitable_sequence_count}++; | |
3934 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
3935 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
3936 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
3937 # print "$ambiguous_read_1\n"; | |
3938 # print "$ambiguous_read_2\n"; | |
3939 | |
3940 if ($ambiguous){ | |
3941 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
3942 } | |
3943 elsif ($unmapped){ | |
3944 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
3945 } | |
3946 else{ | |
3947 return 0; | |
3948 } | |
3949 } | |
3950 | |
3951 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file | |
3952 unless (%alignments){ | |
3953 $counting{no_single_alignment_found}++; | |
3954 | |
3955 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
3956 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
3957 # print "$unmapped_read_1\n"; | |
3958 # print "$unmapped_read_2\n"; | |
3959 if ($unmapped){ | |
3960 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified | |
3961 } | |
3962 else{ | |
3963 return 0; | |
3964 } | |
3965 } | |
3966 | |
3967 ####################################################################################################################################################### | |
3968 | |
3969 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
3970 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
3971 ### alignment score we are discarding the sequence pair altogether. | |
3972 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) | |
3973 ### and extending (3 per bp) the gap. | |
3974 | |
3975 ####################################################################################################################################################### | |
3976 | |
3977 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
3978 my $methylation_call_params; # hash reference | |
3979 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
3980 | |
3981 ### print contents of %alignments for debugging | |
3982 ## if (scalar keys %alignments >= 1){ | |
3983 # print "\n******\n"; | |
3984 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
3985 # print "Loc: $alignment_location\n"; | |
3986 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
3987 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; | |
3988 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; | |
3989 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; | |
3990 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; | |
3991 # print "Index $alignments{$alignment_location}->{index}\n"; | |
3992 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
3993 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; | |
3994 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; | |
3995 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; | |
3996 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; | |
3997 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; | |
3998 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; | |
3999 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; | |
4000 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; | |
4001 # } | |
4002 # print "\n******\n"; | |
4003 # } | |
4004 | |
4005 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment | |
4006 if (scalar keys %alignments == 1){ | |
4007 for my $unique_best_alignment (keys %alignments){ | |
4008 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; | |
4009 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; | |
4010 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
4011 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; | |
4012 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; | |
4013 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
4014 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; | |
4015 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; | |
4016 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; | |
4017 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores_second_best}; | |
4018 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; | |
4019 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; | |
4020 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; | |
4021 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; | |
4022 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; | |
4023 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; | |
4024 } | |
4025 } | |
4026 | |
4027 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
4028 ### we boot the sequence pair altogether) | |
4029 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
4030 my $best_sum_of_alignment_scores; | |
4031 my $best_alignment_location; | |
4032 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ | |
4033 | |
4034 # warn "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; sleep(1); | |
4035 | |
4036 unless (defined $best_sum_of_alignment_scores){ | |
4037 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; | |
4038 $best_alignment_location = $alignment_location; | |
4039 # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; | |
4040 } | |
4041 else{ | |
4042 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted | |
4043 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ | |
4044 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; | |
4045 $sequence_pair_fails = 1; | |
4046 last; # exiting since we know that the sequence has ambiguous alignments | |
4047 } | |
4048 ### else we are going to store the best alignment for further processing | |
4049 else{ | |
4050 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; | |
4051 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; | |
4052 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
4053 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; | |
4054 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; | |
4055 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
4056 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; | |
4057 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; | |
4058 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; | |
4059 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; | |
4060 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; | |
4061 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; | |
4062 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; | |
4063 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; | |
4064 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; | |
4065 | |
4066 if (defined $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} and ( $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} > $alignments{$alignment_location}->{sum_of_alignment_scores} )) { | |
4067 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best}; | |
4068 } | |
4069 else { | |
4070 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$alignment_location}->{sum_of_alignment_scores}; | |
4071 } | |
4072 | |
4073 last; # exiting since the sequence produced a unique best alignment | |
4074 } | |
4075 } | |
4076 } | |
4077 } | |
4078 else{ | |
4079 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; | |
4080 } | |
4081 | |
4082 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions | |
4083 if ($sequence_pair_fails == 1){ | |
4084 $counting{unsuitable_sequence_count}++; | |
4085 | |
4086 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
4087 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
4088 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
4089 # warn "$ambiguous_read_1\n"; | |
4090 # warn "$ambiguous_read_2\n"; | |
4091 | |
4092 if ($ambiguous){ | |
4093 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
4094 } | |
4095 elsif ($unmapped){ | |
4096 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
4097 } | |
4098 else{ | |
4099 return 0; # => exits to next sequence pair (default) | |
4100 } | |
4101 } | |
4102 | |
4103 ### --DIRECTIONAL | |
4104 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
4105 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
4106 if ($directional){ | |
4107 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ | |
4108 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
4109 $counting{alignments_rejected_count}++; | |
4110 return 0; | |
4111 } | |
4112 } | |
4113 | |
4114 ### If the sequence pair has not been rejected so far it does have a unique best alignment | |
4115 $counting{unique_best_alignment_count}++; | |
4116 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); | |
4117 | |
4118 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call | |
4119 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | |
4120 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; | |
4121 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
4122 return 0; | |
4123 } | |
4124 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | |
4125 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; | |
4126 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
4127 return 0; | |
4128 } | |
4129 | |
4130 ### Compute MAPQ value | |
4131 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence_1), length($sequence_2), | |
4132 $methylation_call_params->{$identifier}->{sum_of_alignment_scores}, | |
4133 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best}); | |
4134 | |
4135 | |
4136 ### now we are set to perform the actual methylation call | |
4137 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); | |
4138 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); | |
4139 # warn "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; | |
4140 # warn " $sequence_2\n"; | |
4141 # warn "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; | |
4142 # warn " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; | |
4143 | |
4144 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); | |
4145 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 | |
4146 } | |
4147 | |
4148 ### | |
4149 | |
4150 # Compute MAPQ value for a read or read pair as in Bowtie2-2.2.2 (specifically, V2 of the MAPQ calculator: "class BowtieMapq2") | |
4151 # assuming end-to-end alignment with the default calculation of the minimum alignment score | |
4152 | |
### Computes the MAPQ value of a read or read pair following the V2 MAPQ calculator of Bowtie 2
### ("class BowtieMapq2"), assuming end-to-end alignments (where the best possible alignment score is 0).
###
### Arguments:
###   $read1Len   - length of read 1
###   $read2Len   - length of read 2 (undef in single-end mode)
###   $AS_best    - alignment score of the best alignment (sum of both mates for read pairs)
###   $AS_secBest - alignment score of the second best alignment (undef if there was none)
###
### The minimum valid alignment score is derived from the file-level settings $score_min_intercept and
### $score_min_slope. Returns an integer MAPQ value between 0 and 42.
sub calc_mapq {
  my ($read1Len, $read2Len, $AS_best, $AS_secBest) = @_;

  ### minimum valid score for read 1; in paired-end mode the threshold of read 2 is added on top.
  ### $read2Len is only defined for paired-end reads, so in single-end mode we just use the value for read 1
  my $scMin = $score_min_intercept + $score_min_slope * $read1Len;
  $scMin += $score_min_intercept + $score_min_slope * $read2Len if (defined $read2Len);

  my $diff     = abs($scMin);        # scores can vary by up to this much (since max AS is 0 for end-to-end alignment)
  my $bestOver = $AS_best - $scMin;  # how far the best alignment lies above the minimum score

  ### no second best alignment: the MAPQ only depends on how close the best score is to a perfect one
  unless (defined $AS_secBest){
    return 42 if ($bestOver >= $diff * 0.8);
    return 40 if ($bestOver >= $diff * 0.7);
    return 24 if ($bestOver >= $diff * 0.6);
    return 23 if ($bestOver >= $diff * 0.5);
    return  8 if ($bestOver >= $diff * 0.4);
    return  3 if ($bestOver >= $diff * 0.3);
    return  0;
  }

  ### there is a second best alignment: the MAPQ is chosen by the gap between best and second best
  ### score first, and by the quality of the best alignment second
  my $bestDiff = abs(abs($AS_best) - abs($AS_secBest));
  my $perfect  = ($bestOver == $diff);  # true if the best alignment has the maximum possible score

  return ($perfect ? 39 : 33) if ($bestDiff >= $diff * 0.9);
  return ($perfect ? 38 : 27) if ($bestDiff >= $diff * 0.8);
  return ($perfect ? 37 : 26) if ($bestDiff >= $diff * 0.7);
  return ($perfect ? 36 : 22) if ($bestDiff >= $diff * 0.6);

  if ($bestDiff >= $diff * 0.5){
    return 35 if ($perfect);
    return 25 if ($bestOver >= $diff * 0.84);
    return 16 if ($bestOver >= $diff * 0.68);
    return 5;
  }

  if ($bestDiff >= $diff * 0.4){
    return 34 if ($perfect);
    return 21 if ($bestOver >= $diff * 0.84);
    return 14 if ($bestOver >= $diff * 0.68);
    return 4;
  }

  if ($bestDiff >= $diff * 0.3){
    return 32 if ($perfect);
    return 18 if ($bestOver >= $diff * 0.88);
    return 15 if ($bestOver >= $diff * 0.67);
    return 3;
  }

  if ($bestDiff >= $diff * 0.2){
    return 31 if ($perfect);
    return 17 if ($bestOver >= $diff * 0.88);
    return 11 if ($bestOver >= $diff * 0.67);
    return 0;
  }

  if ($bestDiff >= $diff * 0.1){
    return 30 if ($perfect);
    return 12 if ($bestOver >= $diff * 0.88);
    return  7 if ($bestOver >= $diff * 0.67);
    return 0;
  }

  ### best and second best score are less than 10% of the score range apart (or identical)
  my $well_above_minimum = ($bestOver >= $diff * 0.67);
  return ($well_above_minimum ? 6 : 2) if ($bestDiff > 0);
  return ($well_above_minimum ? 1 : 0);
}
4264 | |
4265 | |
4266 ### | |
4267 | |
### Decides whether the paired-end alignment currently buffered for the Bowtie instance $fhs[$index]
### belongs to the read pair $identifier and has a sensible orientation.
###
### Arguments:
###   $index      - index of the Bowtie instance in @fhs
###   $identifier - ID of the read pair currently being processed (without the trailing /1)
###
### Returns 1 if last_line_1 and last_line_2 of $fhs[$index] hold an alignment of $identifier that passed
### ensure_sensical_alignment_orientation_paired_ends(), and 0 if there is nothing to process this round.
###
### Side effects: whenever the buffered alignment is rejected, up to two further alignments (2 lines each) are
### read from $fhs[$index]->{fh}, and last_seq_id, last_line_1 and last_line_2 are overwritten with the alignment
### to look at next. All three are set to undef once the end of the Bowtie output is reached.
###
### Dies if none of the two reads of a newly read alignment ends on /1, if the orientation check returns
### something other than 0 or 1, or if the same read pair shows up a third time in a row.
sub decide_whether_paired_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### nothing is buffered any more (the end of the output is flagged by undef lines, see below),
  ### so there cannot be an alignment for this read pair
  return 0 unless (defined $fhs[$index]->{last_line_1} and defined $fhs[$index]->{last_line_2});

  ### only the read ID and the strand are needed to validate an alignment
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  my ($seq_id_1,$seq_id_2) = ($id_1,$id_2);
  $seq_id_1 =~ s/\/1$//; # removing the read /1 tag from whichever
  $seq_id_2 =~ s/\/1$//; # of the two lines carries it

  ### the buffered sequence pair is already the next sequence pair to be analysed -> analyse next round
  return 0 unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier);

  ### checking the orientation of the alignment (1 = sensible, 0 = not sensible)
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  return 1 if ($orientation == 1); ### 1st possibility for A SEQUENCE-PAIR TO PASS
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output: nothing to process, as the buffered alignment was in the wrong orientation
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
    return 0;
  }

  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  ($seq_id_1,$seq_id_2) = ($id_1,$id_2);

  ### the ID of the read pair is taken from the first read (ending on /1)
  my $seqid;
  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### second alignment of the same read pair -> checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### wrong orientation again: we read in yet another 2 lines, which must be the next entry
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    ### end of bowtie output
    @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
    return 0;
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);

  if ($seq_id_1 =~ s/\/1$//){
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the same seq ID must not occur a third time; otherwise the entry just read is buffered for the next round
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  @{$fhs[$index]}{qw(last_seq_id last_line_1 last_line_2)} = ($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as both alignments of this read pair were in the wrong orientation
}
4388 | |
4389 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END | |
4390 | |
sub extract_corresponding_genomic_sequence_paired_ends {
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Extracts the genomic sequence underlying both reads of a Bowtie 1 paired-end alignment from %chromosomes and stores it,
  ### together with the strand and conversion information of the alignment, in $methylation_call_params->{$sequence_identifier}.
  ###
  ### Keys read:    index, chromosome, start_seq_1, start_seq_2, bowtie_sequence_1, bowtie_sequence_2
  ### Keys written: alignment_read_1, alignment_read_2, genome_conversion, read_conversion_1, read_conversion_2,
  ###               unmodified_genomic_sequence_1, unmodified_genomic_sequence_2
  ### Side effects: increments one of the strand counters in %counting; dies if index is not 0, 1, 2 or 3
  ###
  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call

  ### reading the index through the full hash path first (so the per-read entry exists exactly as before), then working on the per-read hash directly
  my $index  = $methylation_call_params->{$sequence_identifier}->{index};
  my $params = $methylation_call_params->{$sequence_identifier};

  my $chr     = $params->{chromosome};
  my $start_1 = $params->{start_seq_1};
  my $start_2 = $params->{start_seq_2};

  ### We extract the aligned sequence +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
  ### if the C happens to be at the first or last position of the actually observed sequence
  my $window_1 = length($params->{bowtie_sequence_1})+2;
  my $window_2 = length($params->{bowtie_sequence_2})+2;

  my ($alignment_read_1,$alignment_read_2);
  my ($read_conversion_info_1,$read_conversion_info_2);
  my $genome_conversion;
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
  ### sequences around!

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1,$window_1);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As the second conversion is GA we capture the extra bases 3', so that they are 5' bases after reverse complementation.
    ### The window must not run over the end of the chromosome
    if (length($chromosomes{$chr}) > $start_2+$window_2-1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_2,$window_2);
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
    if ($start_1-1 > 0){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1-2,$window_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation.
    ### A negative substr offset would count from the END of the chromosome and return unrelated bases, so we store an empty sequence
    ### (the same value the other guarded extractions use) if the window would start before the chromosome
    if ($start_2-1 > 0){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_2-2,$window_2);
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'CT';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### Read 1 is in - orientation on the reverse strand and is described by the second bowtie alignment (start_seq_2/bowtie_sequence_2).
    ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_2,$window_2);
    ### the reverse strand sequence needs to be reverse complemented
    $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is described by the first bowtie alignment (start_seq_1/bowtie_sequence_1). It is CT converted so we need to capture 2 extra 3' bases,
    ### which must not run over the end of the chromosome
    if (length($chromosomes{$chr}) > $start_1+$window_1-1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1,$window_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2)             = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'GA';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### Read 1 is in - orientation on the reverse strand and is described by the second bowtie alignment (start_seq_2/bowtie_sequence_2).
    ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' bases after reverse complementation
    if ($start_2-1 > 0){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_2-2,$window_2);
      ### the reverse strand sequence needs to be reverse complemented
      $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is described by the first bowtie alignment (start_seq_1/bowtie_sequence_1). It is GA converted so we need to capture 2 extra 5' bases.
    ### A negative substr offset would count from the END of the chromosome, so we store an empty sequence if the window would start before the chromosome
    if ($start_1-1 > 0){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1-2,$window_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
4531 | |
4532 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END | |
4533 | |
sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Extracts the genomic sequence underlying both reads of a Bowtie 2 paired-end alignment from %chromosomes by walking the CIGAR
  ### string of each read, and stores it, together with the strand and conversion information of the alignment, in
  ### $methylation_call_params->{$sequence_identifier}.
  ###
  ### Keys read:    index, chromosome, CIGAR_1/2, position_1/2 (and mismatch_info_1/2 for the $verbose output)
  ### Keys written: alignment_read_1/2, genome_conversion, read_conversion_1/2, unmodified_genomic_sequence_1/2,
  ###               genomic_seq_for_MD_tag_1/2, end_position_1/2, indels_1/2
  ### Side effects: increments one of the strand counters in %counting. Dies on a malformed CIGAR string, on CIGAR operations other
  ###               than M, I and D, and if index is not 0, 1, 2 or 3.
  ### If the 2 extra bases cannot be extracted because the alignment sits right at the edge of a chromosome we return early and only
  ### store the (incomplete) genomic sequences collected so far; none of the other keys are written in this case.
  ###
  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call

  ### reading the index through the full hash path first (so the per-read entry exists), then working on the per-read hash directly
  my $index  = $methylation_call_params->{$sequence_identifier}->{index};
  my $params = $methylation_call_params->{$sequence_identifier};
  my $chr    = $params->{chromosome};

  ### Working state per read, keyed by the read number (1 or 2)
  my %read;
  foreach my $n (1,2){
    my $cigar = $params->{"CIGAR_$n"};
    my $contains_deletion = 0;
    if ($cigar =~ /D/){
      $contains_deletion = 1;
      if ($verbose){ warn "$cigar\n".$params->{"mismatch_info_$n"}."\n";}
    }

    $read{$n} = {
		 cigar             => $cigar,
		 contains_deletion => $contains_deletion,
		 ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
		 pos               => $params->{"position_$n"}-1,
		 ### genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
		 ### if the C happens to be at the first or last position of the actually observed sequence
		 genomic           => '',
		 ### this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output
		 md_genomic        => '',
		 ### deleted bases are added to the hamming distance value (needed for the NM field in the final SAM output)
		 indels            => 0,
		};
  }

  ### parsing both CIGAR strings before any sequence is extracted
  foreach my $n (1,2){
    my @lengths    = split (/\D+/,$read{$n}->{cigar}); # storing the length per operation
    my @operations = split (/\d+/,$read{$n}->{cigar}); # storing the operation
    shift @operations;                                 # remove the empty first element
    die "CIGAR $n string contained a non-matching number of lengths and operations\n" unless (scalar @lengths == scalar @operations);
    $read{$n}->{lengths}    = \@lengths;
    $read{$n}->{operations} = \@operations;
  }

  ### For indexes 1 and 3 the 2 extra bases are taken from the 5' end, for indexes 0 and 2 from the 3' end (this applies to both reads)
  my $extend_5_prime = ( ($index == 1) or ($index == 3) );
  my $extend_3_prime = ( ($index == 0) or ($index == 2) );

  foreach my $n (1,2){
    my $r = $read{$n};

    ### 2 additional bp at the 5' end
    if ($extend_5_prime){
      ### checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome.
      ### A start position of exactly 2 is fine as the 2 extra bases then begin at the very first base of the chromosome
      unless ( ($r->{pos}-2) >= 0){ # exiting with an empty genomic sequence otherwise
	foreach my $m (1..$n){
	  $params->{"unmodified_genomic_sequence_$m"} = $read{$m}->{genomic};
	}
	$params->{"genomic_seq_for_MD_tag_$n"} = $r->{md_genomic};
	return;
      }
      $r->{genomic} .= substr ($chromosomes{$chr},$r->{pos}-2,2);
    }

    foreach my $i (0..$#{$r->{lengths}}){
      my $operation = $r->{operations}->[$i];
      my $length    = $r->{lengths}->[$i];

      if ($operation eq 'M'){
	### extracting genomic sequence and adjusting the position
	my $genomic_bases = substr ($chromosomes{$chr},$r->{pos},$length);
	$r->{genomic} .= $genomic_bases;
	if ($r->{contains_deletion}){
	  $r->{md_genomic} .= $genomic_bases;
	}
	$r->{pos} += $length;
      }
      elsif ($operation eq 'I'){ # insertion in the read sequence
	### we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls, and we can later
	### ignore it for the generation of the MD:Z: tag. The position doesn't need adjusting.
	### 03 06 2014: we don't need to add anything to the hamming distance for insertions since the padding Xs will fail a base by base comparison
	$r->{genomic} .= 'X' x $length;
	if ($r->{contains_deletion}){
	  $r->{md_genomic} .= 'X' x $length;
	}
      }
      elsif ($operation eq 'D'){ # deletion in the read sequence
	### we do not add any genomic sequence but only adjust the position
	### we do however need to add the genomic sequence to the MD tag sequence so we can create a proper MD tag later
	if ($r->{contains_deletion}){
	  $r->{md_genomic} .= substr ($chromosomes{$chr},$r->{pos},$length);
	}
	$r->{pos}    += $length;
	$r->{indels} += $length; # adding to the hamming distance (= single base mismatches, insertions or deletions) for the SAM output
      }
      elsif ($r->{cigar} =~ tr/NSHPX=//){ # if these (for standard mapping) illegal characters exist we die
	die "The CIGAR $n string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $r->{cigar}\n";
      }
      else{
	die "The CIGAR $n string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $r->{cigar}\n";
      }
    }

    ### 2 additional bp at the 3' end
    if ($extend_3_prime){
      ### checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
      unless (length($chromosomes{$chr}) >= $r->{pos}+2){ # exiting without the 2 extra bases otherwise
	### for read 2 we need to set read 1 as well to prevent warnings
	foreach my $m (1..$n){
	  $params->{"unmodified_genomic_sequence_$m"} = $read{$m}->{genomic};
	}
	return;
      }
      $r->{genomic} .= substr ($chromosomes{$chr},$r->{pos},2);
    }
  }

  ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  my ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion,$reverse_strand_read);

  if ($index == 0){
    ### CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation); sequence originated from (converted) forward strand
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion,$reverse_strand_read) = ('CT_GA_CT_count','+','-','CT','GA','CT',2);
  }
  elsif ($index == 1){
    ### GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation); sequence originated from complementary to (converted) bottom strand
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion,$reverse_strand_read) = ('GA_CT_GA_count','+','-','GA','CT','GA',2);
  }
  elsif ($index == 2){
    ### GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation); sequence originated from the complementary to (converted) top strand
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion,$reverse_strand_read) = ('GA_CT_CT_count','-','+','GA','CT','CT',1);
  }
  elsif ($index == 3){
    ### CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation); sequence originated from the (converted) reverse strand
    ($counter,$alignment_read_1,$alignment_read_2,$read_conversion_info_1,$read_conversion_info_2,$genome_conversion,$reverse_strand_read) = ('CT_GA_GA_count','-','+','CT','GA','GA',1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  ### The genomic sequence of the read on the reverse strand needs to be reverse complemented
  my $reverse = $read{$reverse_strand_read};
  $reverse->{genomic} = reverse_complement($reverse->{genomic});
  if ($reverse->{contains_deletion}){
    $reverse->{md_genomic} = reverse_complement($reverse->{md_genomic});
  }

  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $read{1}->{genomic};
  $params->{unmodified_genomic_sequence_2} = $read{2}->{genomic};
  $params->{genomic_seq_for_MD_tag_1}      = $read{1}->{md_genomic};
  $params->{genomic_seq_for_MD_tag_2}      = $read{2}->{md_genomic};

  ## the end position of a read is the position reached after walking its CIGAR string
  $params->{end_position_1} = $read{1}->{pos};
  $params->{end_position_2} = $read{2}->{pos};
  $params->{indels_1}       = $read{1}->{indels};
  $params->{indels_2}       = $read{2}->{indels};
}
4818 | |
4819 ########################################## | |
4820 ### PRINT SINGLE END RESULTS: Bowtie 1 ### | |
4821 ########################################## | |
4822 | |
sub print_bisulfite_mapping_result_single_end{
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### Writes the mapping result and methylation call of a single-end Bowtie 1 alignment: either as a tab-delimited line to OUT
  ### (if $vanilla is set) or via single_end_SAM_output() otherwise.
  ### Note that the stored start position of the read is incremented by 1 as a side effect.

  ### we will output the FastQ quality in Sanger encoding (Phred 33 scale); $phred64 is checked before $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### We will add +1 bp to the starting position of single-end reads, as Bowtie 1 reports the index and not the bp position.
  $methylation_call_params->{$identifier}->{position} += 1;

  ### SAM output, default since Bismark v1.0.0 (the sub is located at the end of the script)
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value) unless ($vanilla);

  ### vanilla output: writing every uniquely mapped read and its methylation call to the output file
  my $alignment = $methylation_call_params->{$identifier};
  my @fields = (
		$identifier,
		@{$alignment}{qw(alignment_strand chromosome position end_position)},
		$sequence,
		@{$alignment}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
		$quality_value,
	       );
  my $bowtie1_output = join("\t",@fields);
  print OUT "$bowtie1_output\n";
}
4846 | |
4847 ########################################## | |
4848 ### PRINT SINGLE END RESULTS: Bowtie 2 ### | |
4849 ########################################## | |
4850 | |
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Reports one aligned single-end read (Bowtie 2 mode) together with its methylation call.
  ### Bowtie 2 results are always passed on to single_end_SAM_output(); unmapped and ambiguous
  ### reads were already printed before this point.
  ###
  ### Arguments: the read identifier, the read sequence, the hash reference holding the per-read
  ###            alignment details, and the quality string of the read.
  my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale)
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  $quality_value = $to_phred33->($quality_value) if ($to_phred33);

  ### the SAM routine can be found at the end of the script
  return single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
4865 | |
4866 ########################################## | |
4867 ### PRINT PAIRED END RESULTS: Bowtie 1 ### | |
4868 ########################################## | |
4869 | |
sub print_bisulfite_mapping_results_paired_ends{
  ### Reports one aligned read pair (Bowtie 1 mode) together with the methylation calls of both reads.
  ###
  ### Arguments: the pair identifier, the sequences of read 1 and read 2, the hash reference holding
  ###            the per-pair alignment details, and the quality strings of read 1 and read 2.
  ### Output:    a tab-delimited line on the OUT filehandle if $vanilla is set, otherwise the pair
  ###            is handed over to paired_end_SAM_output().
  ### Side effect: the stored start position (start_seq_1) is incremented by 1.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale)
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### Bowtie 1 reports the index and not the bp position, hence the start position is shifted by +1.
  ### (The end position is already 1-based)
  $methylation_call_params->{$identifier}->{start_seq_1} += 1;

  ### SAM output, default since Bismark v1.0.0 (the SAM routine lives at the end of the script)
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) unless ($vanilla);

  ### Custom (vanilla) Bismark output: one tab-delimited line per read pair
  my $pair = $methylation_call_params->{$identifier};
  my @fields = (
    $identifier,
    @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
    $sequence_1,
    @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
    $sequence_2,
    @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
    $quality_value_1,
    $quality_value_2,
  );
  print OUT join("\t",@fields) . "\n";
}
4896 | |
4897 ########################################## | |
4898 ### PRINT PAIRED END RESULTS: Bowtie 2 ### | |
4899 ########################################## | |
4900 | |
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  ### Reports one aligned read pair (Bowtie 2 mode) together with the methylation calls of both reads.
  ### Bowtie 2 results are always passed on to paired_end_SAM_output(); unmapped and ambiguous
  ### reads were already printed before this point.
  ###
  ### Arguments: the pair identifier, the sequences of read 1 and read 2, the hash reference holding
  ###            the per-pair alignment details, and the quality strings of read 1 and read 2.
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

  ### Qualities are always written in Sanger encoding (Phred 33 scale)
  my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                 : $solexa  ? \&convert_solexa_quals_to_phred33
                 :            undef;
  if ($to_phred33){
    $quality_value_1 = $to_phred33->($quality_value_1);
    $quality_value_2 = $to_phred33->($quality_value_2);
  }

  ### the SAM routine can be found at the end of the script
  return paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
}
4918 | |
4919 | |
sub convert_phred64_quals_to_phred33{
  ### Takes a Phred+64 encoded quality string and returns the same qualities in Phred+33 encoding.
  ### Each character is translated individually: first into its Phred score and then into the
  ### corresponding Phred+33 character.
  my ($phred64_quality) = @_;

  return join ("", map {
    my $phred_score = convert_phred64_quality_string_into_phred_score ($_);
    convert_phred_score_into_phred33_quality_string ($phred_score);
  } split (//,$phred64_quality));
}
4935 | |
sub convert_solexa_quals_to_phred33{
  ### Takes a Solexa (pre-1.3 pipeline) encoded quality string and returns the qualities in
  ### Phred+33 encoding. Each character is translated individually: first into a Phred score
  ### and then into the corresponding Phred+33 character.
  my ($solexa_quality) = @_;

  return join ("", map {
    my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($_);
    convert_phred_score_into_phred33_quality_string ($phred_score);
  } split (//,$solexa_quality));
}
4951 | |
sub convert_phred_score_into_phred33_quality_string{
  ### Returns the single character that represents the supplied numeric Phred score
  ### in Phred+33 encoding, i.e. the character with the code point (score + 33)
  my ($phred_score) = @_;
  return chr($phred_score + 33);
}
4957 | |
sub convert_phred64_quality_string_into_phred_score{
  ### Returns the numeric Phred score of a Phred+64 encoded quality character,
  ### i.e. the code point of the character minus 64
  my ($quality_character) = @_;
  return ord($quality_character) - 64;
}
4963 | |
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Returns an (approximate) numeric Phred score for a Solexa (pre-1.3 pipeline) quality character.
  ### A fixed offset of 59 is used here: all Phred scores between 10 and 40 look exactly the same,
  ### and there is only a minute difference for values between 0 and 10.
  ### NOTE(review): presumably this stands in for an exact Solexa-to-Phred conversion - confirm
  ### that the plain offset is still the intended behaviour.
  my ($quality_character) = @_;
  return ord($quality_character) - 59;
}
4970 | |
4971 | |
sub extract_corresponding_genomic_sequence_single_end {
  ### Extracts the genomic sequence a single-end Bowtie 1 alignment corresponds to, and annotates the
  ### read's record in $methylation_call_params with everything needed for the methylation call.
  ###
  ### Arguments: the sequence identifier and the hash reference holding the per-read alignment details.
  ###            The keys 'index', 'chromosome', 'position' and 'bowtie_sequence' of the record are read.
  ### Stores:    alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence
  ###            and end_position (position + read length) in the record.
  ### Also increments the matching strand counter in %counting.
  ### Dies if the (PBAT adjusted) index is not one of 0, 1, 2 or 3.
  ###
  ### A bisulfite read for one location in the genome can theoretically stem from any of the 4 possible
  ### converted strands. The genomic sequence is extracted with 2 extra bases so that a C at the very
  ### last position of the read can still be assigned to CpG, CHG or CHH context. If the 2 extra bases
  ### would reach beyond the edge of the chromosome the stored genomic sequence is the empty string.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Alias for the record of this read. Taking a reference to the dereferenced slot creates the record
  ### if it does not exist yet, just like a chained ->{...}->{...} lookup would
  my $read = \%{ $methylation_call_params->{$sequence_identifier} };

  ### In PBAT mode indexes 0 and 1 are simply not run, so the stored index is shifted by 2
  my $strand_index = $read->{index} + ($pbat ? 2 : 0);

  ### alignment_strand tells us which strand of the genome the read is compared against, the read
  ### conversion whether we are looking for C->T or G->A substitutions. $extra_bases_upstream is true
  ### if the 2 extra bases have to be taken from before the alignment start on the forward strand
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream);

  if ($strand_index == 0){
    ### CT converted read vs. CT converted genome: originated from the (converted) forward strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream) = ('CT_CT_count','+','CT','CT',0);
  }
  elsif ($strand_index == 1){
    ### CT converted read vs. GA converted genome: originated from the (converted) reverse strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream) = ('CT_GA_count','-','CT','GA',1);
  }
  elsif ($strand_index == 2){
    ### GA converted read vs. CT converted genome: complementary to the (converted) forward strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream) = ('GA_CT_count','-','GA','CT',0);
  }
  elsif ($strand_index == 3){
    ### GA converted read vs. GA converted genome: complementary to the (converted) reverse strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion,$extra_bases_upstream) = ('GA_GA_count','+','GA','GA',1);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  my $chromosome  = $read->{chromosome};
  my $start       = $read->{position};
  my $read_length = length($read->{bowtie_sequence});

  ### Checking if the substring will be valid, or if the sequence can't be extracted because the
  ### alignment sits right at the edge of a chromosome
  my $extraction_possible = $extra_bases_upstream
    ? ($start - 2 >= 0)
    : (length($chromosomes{$chromosome}) > $start + $read_length + 1);

  my $non_bisulfite_sequence = '';
  if ($extraction_possible){
    ### read length + 2 bases; the extra bases either follow the read or precede it on the forward strand
    my $offset = $extra_bases_upstream ? $start - 2 : $start;
    $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$offset,$read_length + 2);

    ### '-' strand alignments are compared against the reverse complement, which also moves the
    ### extra bases to the other end of the sequence
    if ($alignment_strand eq '-'){
      $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

  ### at this point we can also determine the end position of the read
  $read->{end_position} = $start + $read_length;
}
5081 | |
5082 | |
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
  ### Extracts the genomic sequence a single-end Bowtie 2 alignment corresponds to (following the CIGAR
  ### string), and annotates the read's record in $methylation_call_params for the methylation call.
  ###
  ### Arguments: the sequence identifier and the hash reference holding the per-read alignment details.
  ###            The keys 'CIGAR', 'position' (1-based), 'chromosome' and 'index' of the record are read.
  ### Stores:    alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence,
  ###            genomic_seq_for_MD_tag, end_position (the value of $pos after walking the CIGAR string)
  ###            and indels (the summed length of all deletions).
  ### Also increments the matching strand counter in %counting.
  ### Dies on CIGAR strings with a non-matching number of lengths and operations, on CIGAR operations
  ### other than M, I and D, and if the (PBAT adjusted) index is not one of 0, 1, 2 or 3.
  ###
  ### If the 2 extra bases needed for the sequence context would reach beyond the edge of the chromosome
  ### the sub returns early and only stores the two sequence keys: both are empty if the bases are
  ### missing at the start, whereas at the end the genomic sequence collected so far is stored without
  ### the 2 extra bases. No strand information is stored and no counter is incremented in that case.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### Alias for the record of this read. Taking a reference to the dereferenced slot creates the record
  ### if it does not exist yet, just like a chained ->{...}->{...} lookup would
  my $read = \%{ $methylation_call_params->{$sequence_identifier} };

  my $cigar = $read->{CIGAR};
  my $contains_deletion = ($cigar =~ /D/) ? 1 : 0;

  ### The genomic sequence gets 2 extra bases at the end (or start) so that a C at the last (or first)
  ### position of the read can still be assigned to CpG, CHG or CHH context
  my $non_bisulfite_sequence = '';
  ### this sequence additionally contains deleted genomic bases so that a proper MD tag can be
  ### generated for the SAM output. It is only filled if the CIGAR string contains a deletion
  my $genomic_seq_for_MD_tag = '';

  ### Positions in SAM format are 1-based, substr() offsets are 0-based
  my $pos = $read->{position} - 1;

  ### parsing the CIGAR string into parallel lists of lengths and operations
  my @op_lengths = split (/\D+/,$cigar);
  my @operations = split (/\d+/,$cigar);
  shift @operations; # the first element is empty
  die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @op_lengths == scalar @operations);

  ### In PBAT mode indexes 0 and 1 are simply not run, so the stored index is shifted by 2
  my $strand_index = $read->{index} + ($pbat ? 2 : 0);
  my $chromosome   = $read->{chromosome};

  ### OB (index 1) and CTOB (index 3) align to the GA converted genome, OT (index 0) and CTOT (index 2)
  ### to the CT converted genome
  my $aligned_to_GA_genome = ($strand_index == 1 || $strand_index == 3);
  my $aligned_to_CT_genome = ($strand_index == 0 || $strand_index == 2);

  ### GA converted genome: the 2 extra bases are taken from in front of the alignment start
  if ($aligned_to_GA_genome){
    if ($pos - 2 < 0){ # too close to the start of the chromosome
      $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
      $read->{genomic_seq_for_MD_tag}      = $genomic_seq_for_MD_tag;
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
  }

  my $indels = 0;

  for my $op_index (0..$#op_lengths){
    my $operation = $operations[$op_index];
    my $length    = $op_lengths[$op_index];

    if ($operation eq 'M'){
      ### (mis)match: copying the genomic sequence and moving on in the genome
      my $genomic_stretch = substr ($chromosomes{$chromosome},$pos,$length);
      $non_bisulfite_sequence .= $genomic_stretch;
      $genomic_seq_for_MD_tag .= $genomic_stretch if ($contains_deletion);
      $pos += $length;
    }
    elsif ($operation eq 'I'){
      ### insertion in the read: padding with Xs instead of genomic sequence. These positions are not
      ### used to infer methylation calls, and the genomic position does not move. Insertions are not
      ### added to $indels either since the padding Xs already fail the base by base comparison
      ### in hemming_dist()
      my $padding = 'X' x $length;
      $non_bisulfite_sequence .= $padding;
      $genomic_seq_for_MD_tag .= $padding if ($contains_deletion);
    }
    elsif ($operation eq 'D'){
      ### deletion in the read: the deleted genomic bases only go into the MD tag sequence
      if ($contains_deletion){
        $genomic_seq_for_MD_tag .= substr ($chromosomes{$chromosome},$pos,$length);
      }
      $pos    += $length;
      $indels += $length; # needed to determine the hemming distance for the SAM output
    }
    elsif ($cigar =~ tr/[NSHPX=]//){ # operations which are illegal for standard mapping
      die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
    else{
      die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
    }
  }

  ### CT converted genome: the 2 extra bases are taken from behind the end of the alignment
  if ($aligned_to_CT_genome){
    if (length($chromosomes{$chromosome}) < $pos + 2){ # too close to the end of the chromosome
      $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
      $read->{genomic_seq_for_MD_tag}      = $genomic_seq_for_MD_tag;
      return;
    }
    $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
  }

  ### alignment_strand tells us which strand of the genome the read is compared against, the read
  ### conversion whether we are looking for C->T or G->A substitutions
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion);

  if ($strand_index == 0){
    ### CT converted read vs. CT converted genome: originated from the (converted) forward strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_CT_count','+','CT','CT');
  }
  elsif ($strand_index == 1){
    ### CT converted read vs. GA converted genome: originated from the (converted) reverse strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_GA_count','-','CT','GA');
  }
  elsif ($strand_index == 2){
    ### GA converted read vs. CT converted genome: complementary to the (converted) forward strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_CT_count','-','GA','CT');
  }
  elsif ($strand_index == 3){
    ### GA converted read vs. GA converted genome: complementary to the (converted) reverse strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_GA_count','+','GA','GA');
  }
  else{
    die "Too many Bowtie 2 result filehandles\n";
  }
  $counting{$counter}++;

  ### '-' strand alignments are compared against the reverse complement
  if ($alignment_strand eq '-'){
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    if ($contains_deletion){
      $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag);
    }
  }

  $read->{alignment_strand}            = $alignment_strand;
  $read->{read_conversion}             = $read_conversion_info;
  $read->{genome_conversion}           = $genome_conversion;
  $read->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
  $read->{genomic_seq_for_MD_tag}      = $genomic_seq_for_MD_tag;

  ### the end position of the read is stored in $pos
  $read->{end_position} = $pos;
  $read->{indels}       = $indels;
}
5256 | |
5257 ### METHYLATION CALL | |
5258 | |
5259 sub methylation_call{ | |
5260 my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_; | |
5261 ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one | |
5262 my @seq = split(//,$sequence_actually_observed); | |
5263 my @genomic = split(//,$genomic_sequence); | |
5264 # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n"; | |
5265 ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either | |
5266 ### CpG, CHH or CHG context | |
5267 | |
5268 ################################################################# | |
5269 ### . for bases not involving cytosines ### | |
5270 ### X for methylated C in CHG context (was protected) ### | |
5271 ### x for not methylated C in CHG context (was converted) ### | |
5272 ### H for methylated C in CHH context (was protected) ### | |
5273 ### h for not methylated C in CHH context (was converted) ### | |
5274 ### Z for methylated C in CpG context (was protected) ### | |
5275 ### z for not methylated C in CpG context (was converted) ### | |
5276 ### U for methylated C in unknown context (was protected) ### | |
5277 ### u for not methylated C in unknwon context (was converted) ### | |
5278 ################################################################# | |
5279 | |
5280 my @match =(); | |
5281 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2 | |
5282 my $methyl_CHH_count = 0; | |
5283 my $methyl_CHG_count = 0; | |
5284 my $methyl_CpG_count = 0; | |
5285 my $methyl_C_unknown_count = 0; | |
5286 my $unmethylated_CHH_count = 0; | |
5287 my $unmethylated_CHG_count = 0; | |
5288 my $unmethylated_CpG_count = 0; | |
5289 my $unmethylated_C_unknown_count = 0; | |
5290 | |
5291 if ($read_conversion eq 'CT'){ | |
5292 for my $index (0..$#seq) { | |
5293 if ($seq[$index] eq $genomic[$index]) { | |
5294 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation | |
5295 if ($genomic[$index] eq 'C') { | |
5296 ### If the residue is a C we want to know if it was in CpG context or in any other context | |
5297 my $downstream_base = $genomic[$index+1]; | |
5298 | |
5299 if ($downstream_base eq 'G'){ | |
5300 ++$methyl_CpG_count; | |
5301 push @match,'Z'; # protected C, methylated, in CpG context | |
5302 } | |
5303 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) | |
5304 ++$methyl_C_unknown_count; | |
5305 push @match,'U'; # protected C, methylated, in Unknown context | |
5306 } | |
5307 else { | |
5308 ### C in not in CpG-context, determining the second downstream base context | |
5309 my $second_downstream_base = $genomic[$index+2]; | |
5310 | |
5311 if ($second_downstream_base eq 'G'){ | |
5312 ++$methyl_CHG_count; | |
5313 push @match,'X'; # protected C, methylated, in CHG context | |
5314 } | |
5315 elsif ($second_downstream_base eq 'N'){ | |
5316 ++$methyl_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
5317 push @match,'U'; # protected C, methylated, in Unknown context | |
5318 } | |
5319 else{ | |
5320 ++$methyl_CHH_count; | |
5321 push @match,'H'; # protected C, methylated, in CHH context | |
5322 } | |
5323 } | |
5324 } | |
5325 else { | |
5326 push @match, '.'; | |
5327 } | |
5328 } | |
5329 elsif ($seq[$index] ne $genomic[$index]) { | |
5330 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts | |
5331 ### in the actually observed sequence | |
5332 if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') { | |
5333 ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context | |
5334 my $downstream_base = $genomic[$index+1]; | |
5335 | |
5336 if ($downstream_base eq 'G'){ | |
5337 ++$unmethylated_CpG_count; | |
5338 push @match,'z'; # converted C, not methylated, in CpG context | |
5339 } | |
5340 elsif ($downstream_base eq 'N'){ # if the downstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) | |
5341 ++$unmethylated_C_unknown_count; | |
5342 push @match,'u'; # converted C, not methylated, in Unknown context | |
5343 } | |
5344 else{ | |
5345 ### C in not in CpG-context, determining the second downstream base context | |
5346 my $second_downstream_base = $genomic[$index+2]; | |
5347 | |
5348 if ($second_downstream_base eq 'G'){ | |
5349 ++$unmethylated_CHG_count; | |
5350 push @match,'x'; # converted C, not methylated, in CHG context | |
5351 } | |
5352 elsif ($second_downstream_base eq 'N'){ | |
5353 ++$unmethylated_C_unknown_count; # if the second downstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
5354 push @match,'u'; # converted C, not methylated, in Unknown context | |
5355 } | |
5356 else{ | |
5357 ++$unmethylated_CHH_count; | |
5358 push @match,'h'; # converted C, not methylated, in CHH context | |
5359 } | |
5360 } | |
5361 } | |
5362 ### all other mismatches are not of interest for a methylation call | |
5363 else { | |
5364 push @match,'.'; | |
5365 } | |
5366 } | |
5367 else{ | |
5368 die "There can be only 2 possibilities\n"; | |
5369 } | |
5370 } | |
5371 } | |
5372 elsif ($read_conversion eq 'GA'){ | |
5373 # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n"; | |
5374 | |
5375 for my $index (0..$#seq) { | |
5376 if ($seq[$index] eq $genomic[$index+2]) { | |
5377 ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation | |
5378 if ($genomic[$index+2] eq 'G') { | |
5379 ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need | |
5380 ### to look if the base upstream is a C | |
5381 | |
5382 my $upstream_base = $genomic[$index+1]; | |
5383 | |
5384 if ($upstream_base eq 'C'){ | |
5385 ++$methyl_CpG_count; | |
5386 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context | |
5387 } | |
5388 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) | |
5389 ++$methyl_C_unknown_count; | |
5390 push @match,'U'; # protected C on opposing strand, methylated, in Unknown context | |
5391 } | |
5392 else{ | |
5393 ### C in not in CpG-context, determining the second upstream base context | |
5394 my $second_upstream_base = $genomic[$index]; | |
5395 | |
5396 if ($second_upstream_base eq 'C'){ | |
5397 ++$methyl_CHG_count; | |
5398 push @match,'X'; # protected C on opposing strand, methylated, in CHG context | |
5399 } | |
5400 elsif ($second_upstream_base eq 'N'){ | |
5401 ++$methyl_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
5402 push @match,'U'; # protected C, methylated, in Unknown context | |
5403 } | |
5404 else{ | |
5405 ++$methyl_CHH_count; | |
5406 push @match,'H'; # protected C on opposing strand, methylated, in CHH context | |
5407 } | |
5408 } | |
5409 } | |
5410 else{ | |
5411 push @match, '.'; | |
5412 } | |
5413 } | |
5414 elsif ($seq[$index] ne $genomic[$index+2]) { | |
5415 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts | |
5416 ### on the opposing strand, so G to A conversions in the actually observed sequence | |
5417 if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') { | |
5418 ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if | |
5419 ### the C on the opposing strand was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream! | |
5420 | |
5421 my $upstream_base = $genomic[$index+1]; | |
5422 | |
5423 if ($upstream_base eq 'C'){ | |
5424 ++$unmethylated_CpG_count; | |
5425 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context | |
5426 } | |
5427 elsif ($upstream_base eq 'N'){ # if the upstream base was an N we cannot really be sure about the sequence context (as it might have been a CG) | |
5428 ++$unmethylated_C_unknown_count; | |
5429 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context | |
5430 } | |
5431 else{ | |
5432 ### C is not in CpG-context, determining the second upstream base context | |
5433 my $second_upstream_base = $genomic[$index]; | |
5434 | |
5435 if ($second_upstream_base eq 'C'){ | |
5436 ++$unmethylated_CHG_count; | |
5437 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context | |
5438 } | |
5439 elsif ($second_upstream_base eq 'N'){ | |
5440 ++$unmethylated_C_unknown_count; # if the second upstream base was an N we cannot really be sure about the sequence context (as it might have been a CHH or CHG) | |
5441 push @match,'u'; # converted C on opposing strand, not methylated, in Unknown context | |
5442 } | |
5443 else{ | |
5444 ++$unmethylated_CHH_count; | |
5445 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context | |
5446 } | |
5447 } | |
5448 } | |
5449 ### all other mismatches are not of interest for a methylation call | |
5450 else { | |
5451 push @match,'.'; | |
5452 } | |
5453 } | |
5454 else{ | |
5455 die "There can be only 2 possibilities\n"; | |
5456 } | |
5457 } | |
5458 } | |
5459 else{ | |
5460 die "Strand conversion info is required to perform a methylation call\n"; | |
5461 } | |
5462 | |
5463 my $methylation_call = join ("",@match); | |
5464 | |
5465 $counting{total_meCHH_count} += $methyl_CHH_count; | |
5466 $counting{total_meCHG_count} += $methyl_CHG_count; | |
5467 $counting{total_meCpG_count} += $methyl_CpG_count; | |
5468 $counting{total_meC_unknown_count} += $methyl_C_unknown_count; | |
5469 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count; | |
5470 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count; | |
5471 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count; | |
5472 $counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count; | |
5473 | |
5474 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n"; | |
5475 | |
5476 return $methylation_call; | |
5477 } | |
5478 | |
### Reads all FastA files found in $genome_folder (files ending in .fa, or in .fasta if there are no .fa files)
### and stores the upper-cased sequence of every entry in the global %chromosomes hash, keyed by the
### chromosome name returned by extract_chromosome_name(). The order in which the entries were
### encountered is recorded in the global %SQ_order hash (counter => chromosome name).
###
### Argument: the directory to change back to once the genome has been read.
### Dies if no sequence file is found, if a file is empty or does not start with a FastA header,
### or if a chromosome name occurs more than once.
sub read_genome_into_memory{
    ## working directory
    my $cwd = shift;
    ## reading in and storing the specified genome in the %chromosomes hash
    chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
    warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

    my @chromosome_filenames = <*.fa>;

    ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
    unless (@chromosome_filenames){
        @chromosome_filenames = <*.fasta>;
    }

    unless (@chromosome_filenames){
        die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
    }

    my $SQ_count = 0;

    foreach my $chromosome_filename (@chromosome_filenames){

        ### three-argument open with a lexical handle, so that the file name can never be interpreted as an open mode or a command
        open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

        ### first line needs to be a fastA header
        my $first_line = <$chr_in>;
        unless (defined $first_line){
            die "The sequence file $chromosome_filename is empty\n";
        }
        chomp $first_line;
        $first_line =~ s/\r//;
        ### Extracting chromosome name from the FastA header
        my $chromosome_name = extract_chromosome_name($first_line);
        ### starting with an empty string so that entries without any sequence lines have a length of 0 instead of being undefined
        my $sequence = '';

        ### reading into a lexical variable so that the caller's $_ is left untouched
        while (my $line = <$chr_in>){
            chomp $line;
            $line =~ s/\r//; # removing carriage returns if present
            if ($line =~ /^>/){

                ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
                if (exists $chromosomes{$chromosome_name}){
                    print "chr $chromosome_name (",length $sequence ," bp)\n";
                    die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
                }
                else {
                    if (length($sequence) == 0){
                        warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
                    }
                    print "chr $chromosome_name (",length $sequence ," bp)\n";
                    $chromosomes{$chromosome_name} = $sequence;
                    $SQ_order{$SQ_count} = $chromosome_name;

                    ++$SQ_count;

                }
                ### resetting the sequence variable
                $sequence = '';
                ### setting new chromosome name
                $chromosome_name = extract_chromosome_name($line);
            }
            else{
                $sequence .= uc $line;
            }
        }
        close $chr_in;

        ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files

        if (exists $chromosomes{$chromosome_name}){
            print "chr $chromosome_name (",length $sequence ," bp)\t";
            die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
        }
        else{
            if (length($sequence) == 0){
                warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
            }

            ### NOTE(review): the counter is incremented before it is used as key here, but after it was used in the
            ### loop above. The keys of %SQ_order are therefore strictly increasing but not contiguous. This is kept
            ### as is because the consumers of %SQ_order are not visible from here.
            ++$SQ_count;

            print "chr $chromosome_name (",length $sequence ," bp)\n";
            $chromosomes{$chromosome_name} = $sequence;
            $SQ_order{$SQ_count} = $chromosome_name;
        }
    }
    print "\n";
    chdir $cwd or die "Failed to move to directory $cwd\n";
}
5562 | |
### Returns the chromosome name of a FastA header line, i.e. the text between the leading '>' and the
### first stretch of whitespace.
### Bowtie seems to extract the first string after the initial > in the FASTA file, so we are doing this as well.
### Dies if the line handed in does not start with '>'.
sub extract_chromosome_name {
    my ($header_line) = @_;

    ### guard clause: anything that does not start with a '>' is not a FastA header
    unless ($header_line =~ s/^>//){
        die "The specified chromosome ($header_line) file doesn't seem to be in FASTA format as required!\n";
    }

    ### only the first whitespace-delimited field is of interest
    my ($first_field) = split (/\s+/,$header_line);
    return $first_field;
}
5574 | |
### Returns the reverse complement of a DNA sequence. Only the upper case bases C, A, T and G are
### complemented, all other characters (e.g. N) are kept as they are.
sub reverse_complement{
    my ($sequence) = @_;
    ### reversing first and complementing afterwards gives the same result as the other way round,
    ### since the complement is a per-character mapping
    my $revcomp = scalar reverse $sequence;
    $revcomp =~ tr/CATG/GTAC/;
    return $revcomp;
}
5581 | |
### Writes in-silico bisulfite converted versions of a single-end FastA file to $temp_dir:
### a C -> T converted file, and (unless $directional is set) also a G -> A converted file.
### The files are gzip compressed if $gzip is set and their names are prefixed with "$prefix." if $prefix is set.
### Input files ending in .gz are read through zcat. The global options $skip and $upto restrict the
### reads that are written out.
###
### Argument: path of the FastA input file.
### Returns:  the names (without $temp_dir) of the C -> T and the G -> A file. The G -> A name is returned
###           in directional mode as well, even though that file is not written then.
sub biTransformFastAFiles {
    my $file = shift;

    ### the temporary files are named after the input file without its directory part
    my $filename = $file;
    if ($file =~ m/.*\/(.*)$/){
        $filename = $1;
    }

    ### gzipped version of the infile
    ### The list form of the pipe open and the three-argument open make sure that the file name is handed over
    ### literally, and is neither interpreted by a shell nor as an open mode
    my $in;
    if ($file =~ /\.gz$/){
        open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
    }
    else{
        open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
    }

    if ($skip){
        warn "Skipping the first $skip reads from $file\n";
        sleep (1);
    }
    if ($upto){
        warn "Processing reads up to sequence no. $upto from $file\n";
        sleep (1);
    }

    my $C_to_T_infile = my $G_to_A_infile = $filename;

    if ($gzip){
        $C_to_T_infile =~ s/$/_C_to_T.fa.gz/;
        $G_to_A_infile =~ s/$/_G_to_A.fa.gz/;
    }
    else{
        $C_to_T_infile =~ s/$/_C_to_T.fa/;
        $G_to_A_infile =~ s/$/_G_to_A.fa/;
    }

    if ($prefix){
        $C_to_T_infile = "$prefix.$C_to_T_infile";
        $G_to_A_infile = "$prefix.$G_to_A_infile";
    }

    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

    ### NOTE(review): the gzip pipes below are still run through a shell (the output redirection requires it),
    ### so $temp_dir and the file name must not contain shell metacharacters
    if ($gzip){
        open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
    }
    else{
        open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
    }

    unless ($directional){
        warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
        if ($gzip){
            open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
        }
        else{
            open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
        }
    }

    my $count = 0;

    ### each FastA entry is expected to consist of exactly two lines (header and sequence)
    while (1){
        my $header   = <$in>;
        my $sequence = <$in>;
        last unless ($header and $sequence);

        $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

        ++$count;

        if ($skip){
            next unless ($count > $skip);
        }
        if ($upto){
            last if ($count > $upto);
        }

        $sequence = uc$sequence; # make input file case insensitive

        # detecting if the input file contains tab stops, as this is likely to result in no alignments
        if (index($header,"\t") != -1){
            $seqID_contains_tabs++;
        }

        ### small check if the sequence seems to be in FastA format
        die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

        my $sequence_C_to_T = $sequence;
        $sequence_C_to_T =~ tr/C/T/;
        print CTOT "$header$sequence_C_to_T";

        unless ($directional){
            my $sequence_G_to_A = $sequence;
            $sequence_G_to_A =~ tr/G/A/;
            print GTOA "$header$sequence_G_to_A";
        }
    }

    ### The return value is deliberately not checked: if we stopped reading early (--upto), zcat may have been
    ### terminated by a SIGPIPE, which makes close return false
    close $in;

    close CTOT or die "Failed to close filehandle $!\n";

    if ($directional){
        warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
    }
    else{
        close GTOA or die "Failed to close filehandle $!\n";
        warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    }
    return ($C_to_T_infile,$G_to_A_infile);
}
5695 | |
### Writes in-silico bisulfite converted versions of one file of a paired-end FastA data set to $temp_dir.
### In directional mode read 1 is written as a C -> T converted file only and read 2 as a G -> A converted
### file only; otherwise both conversions are written for the file. The temporary files are always
### uncompressed, even if $gzip is set. The read number is appended to every read ID (/1 or /2 for Bowtie 1,
### /1/1 or /2/2 for Bowtie 2).
###
### Arguments: path of the FastA input file, read number (1 or 2).
### Returns:   directional mode: the name of the file written for this read number;
###            otherwise: the names of the C -> T and the G -> A file (all without $temp_dir).
### Dies if the read number is neither 1 nor 2.
sub biTransformFastAFiles_paired_end {
    my ($file,$read_number) = @_;

    if ($gzip){
        warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
        sleep (2);
    }

    ### the temporary files are named after the input file without its directory part
    my $filename = $file;
    if ($file =~ m/.*\/(.*)$/){
        $filename = $1;
    }

    ### gzipped version of the infile
    ### The list form of the pipe open and the three-argument open make sure that the file name is handed over
    ### literally, and is neither interpreted by a shell nor as an open mode
    my $in;
    if ($file =~ /\.gz$/){
        open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
    }
    else{
        open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
    }

    if ($skip){
        warn "Skipping the first $skip reads from $file\n";
        sleep (1);
    }
    if ($upto){
        warn "Processing reads up to sequence no. $upto from $file\n";
        sleep (1);
    }

    my $C_to_T_infile = my $G_to_A_infile = $filename;

    $C_to_T_infile =~ s/$/_C_to_T.fa/;
    $G_to_A_infile =~ s/$/_G_to_A.fa/;

    if ($prefix){
        $C_to_T_infile = "$prefix.$C_to_T_infile";
        $G_to_A_infile = "$prefix.$G_to_A_infile";
    }

    if ($directional){
        if ($read_number == 1){
            warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
            open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
        }
        elsif ($read_number == 2){
            warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
            open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
        }
        else{
            die "Read number needs to be 1 or 2, but was: $read_number\n\n";
        }
    }
    else{ # all four strand output
        warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
        warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
        open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
        open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }

    my $count = 0;

    ### each FastA entry is expected to consist of exactly two lines (header and sequence)
    while (1){
        my $header   = <$in>;
        my $sequence = <$in>;
        last unless ($header and $sequence);

        $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

        ++$count;

        if ($skip){
            next unless ($count > $skip);
        }
        if ($upto){
            last if ($count > $upto);
        }

        $sequence = uc$sequence; # make input file case insensitive

        # detecting if the input file contains tab stops, as this is likely to result in no alignments
        if (index($header,"\t") != -1){
            $seqID_contains_tabs++;
        }

        ## small check if the sequence seems to be in FastA format
        die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

        ### appending the read number to the read ID (the substitution inserts it in front of the trailing newline)
        if ($read_number == 1){
            if ($bowtie2){
                $header =~ s/$/\/1\/1/;
            }
            else{
                $header =~ s/$/\/1/;
            }
        }
        elsif ($read_number == 2){
            if ($bowtie2){
                $header =~ s/$/\/2\/2/;
            }
            else{
                $header =~ s/$/\/2/;
            }
        }
        else{
            die "Read number needs to be 1 or 2, but was: $read_number\n\n";
        }
        my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

        $sequence_C_to_T =~ tr/C/T/;
        $sequence_G_to_A =~ tr/G/A/;

        if ($directional){

            if ($read_number == 1){
                print CTOT "$header$sequence_C_to_T";
            }
            elsif ($read_number == 2){
                print GTOA "$header$sequence_G_to_A";
            }
        }
        else{
            print CTOT "$header$sequence_C_to_T";
            print GTOA "$header$sequence_G_to_A";
        }
    }

    ### The return value is deliberately not checked: if we stopped reading early (--upto), zcat may have been
    ### terminated by a SIGPIPE, which makes close return false
    close $in;

    ### The output files are closed explicitly so that all buffered data is on disk, and write errors are
    ### detected, before the file names are handed back to the caller
    if ($directional){
        if ($read_number == 1){
            close CTOT or die "Failed to close filehandle $!\n";
            warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
            return ($C_to_T_infile);
        }
        else{
            close GTOA or die "Failed to close filehandle $!\n";
            warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
            return ($G_to_A_infile);
        }
    }
    else{
        close CTOT or die "Failed to close filehandle $!\n";
        close GTOA or die "Failed to close filehandle $!\n";
        warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
        return ($C_to_T_infile,$G_to_A_infile);
    }
}
5852 | |
5853 | |
### Writes in-silico bisulfite converted versions of a single-end FastQ file to $temp_dir:
###   - PBAT mode ($pbat):       a G -> A converted file only
###   - directional mode:        a C -> T converted file only
###   - non-directional mode:    a C -> T as well as a G -> A converted file
### The files are gzip compressed if $gzip is set and their names are prefixed with "$prefix." if $prefix is set.
### Input files ending in .gz are read through zcat. The global options $skip and $upto restrict the
### reads that are written out.
###
### Argument: path of the FastQ input file.
### Returns:  PBAT mode: the name of the G -> A file; otherwise the names of the C -> T and the G -> A file
###           (all without $temp_dir). In directional mode the G -> A name is returned without its
###           _G_to_A suffix, as that file is not written.
sub biTransformFastQFiles {
    my $file = shift;

    ### the temporary files are named after the input file without its directory part
    my $filename = $file;
    if ($file =~ m/.*\/(.*)$/){
        $filename = $1;
    }

    ### gzipped version of the infile
    ### The list form of the pipe open and the three-argument open make sure that the file name is handed over
    ### literally, and is neither interpreted by a shell nor as an open mode
    my $in;
    if ($file =~ /\.gz$/){
        open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
    }
    else{
        open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
    }

    if ($skip){
        warn "Skipping the first $skip reads from $file\n";
        sleep (1);
    }
    if ($upto){
        warn "Processing reads up to sequence no. $upto from $file\n";
        sleep (1);
    }

    my $C_to_T_infile = my $G_to_A_infile = $filename;

    if ($prefix){
        $C_to_T_infile = "$prefix.$C_to_T_infile";
        $G_to_A_infile = "$prefix.$G_to_A_infile";
    }

    ### NOTE(review): the gzip pipes below are still run through a shell (the output redirection requires it),
    ### so $temp_dir and the file name must not contain shell metacharacters
    if ($pbat){ # PBAT-Seq
        if ($gzip){
            $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
        }
        else{
            $G_to_A_infile =~ s/$/_G_to_A.fastq/;
        }

        warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";

        if ($gzip){
            open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
        }
        else{
            open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
        }
    }
    else{ # directional or non-directional
        if ($gzip){
            $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
        }
        else{
            $C_to_T_infile =~ s/$/_C_to_T.fastq/;
        }

        warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

        if ($gzip){
            open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
        }
        else{
            open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n"; # uncompressed option
        }

        unless ($directional){
            if ($gzip){
                $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
            }
            else{
                $G_to_A_infile =~ s/$/_G_to_A.fastq/;
            }

            warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";

            if ($gzip){
                open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
            }
            else{
                open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
            }
        }
    }

    my $count = 0;

    ### each FastQ entry is expected to consist of exactly four lines
    while (1){
        my $identifier    = <$in>;
        my $sequence      = <$in>;
        my $identifier2   = <$in>;
        my $quality_score = <$in>;
        last unless ($identifier and $sequence and $identifier2 and $quality_score);

        $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

        ++$count;

        if ($skip){
            next unless ($count > $skip);
        }
        if ($upto){
            last if ($count > $upto);
        }

        $sequence = uc$sequence; # make input file case insensitive

        # detecting if the input file contains tab stops, as this is likely to result in no alignments
        if (index($identifier,"\t") != -1){
            $seqID_contains_tabs++;
        }

        ## small check if the sequence file appears to be a FastQ file
        if ($count == 1){
            if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
                die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
            }
        }

        if ($pbat){
            my $sequence_G_to_A = $sequence;
            $sequence_G_to_A =~ tr/G/A/;
            print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
        }
        else{ # directional or non-directional
            my $sequence_C_to_T = $sequence;
            $sequence_C_to_T =~ tr/C/T/;
            print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

            unless ($directional){
                my $sequence_G_to_A = $sequence;
                $sequence_G_to_A =~ tr/G/A/;
                print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
            }
        }
    }

    ### The return value is deliberately not checked: if we stopped reading early (--upto), zcat may have been
    ### terminated by a SIGPIPE, which makes close return false
    close $in;

    ### $pbat is tested first, exactly as for opening and writing the files above, so that only filehandles
    ### which were actually opened by this call are closed
    if ($pbat){
        warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
        close GTOA or die "Failed to close filehandle $!\n";
        return ($G_to_A_infile);
    }
    elsif ($directional){
        close CTOT or die "Failed to close filehandle $!\n";
        warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
    }
    else{
        close CTOT or die "Failed to close filehandle $!\n";
        close GTOA or die "Failed to close filehandle $!\n";
        warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
    }

    return ($C_to_T_infile,$G_to_A_infile);
}
6011 | |
### Writes in-silico bisulfite converted versions of one file of a paired-end FastQ data set to $temp_dir.
### In directional mode read 1 is written as a C -> T converted file only and read 2 as a G -> A converted
### file only; otherwise both conversions are written for the file. The files are gzip compressed if $gzip
### is set and their names are prefixed with "$prefix." if $prefix is set. The read number is appended to
### every read ID (/1 or /2 for Bowtie 1, /1/1 or /2/2 for Bowtie 2).
###
### Arguments: path of the FastQ input file, read number (1 or 2).
### Returns:   directional mode: the name of the file written for this read number;
###            otherwise: the names of the C -> T and the G -> A file (all without $temp_dir).
### Dies if the read number is neither 1 nor 2.
sub biTransformFastQFiles_paired_end {
    my ($file,$read_number) = @_;

    ### the temporary files are named after the input file without its directory part
    my $filename = $file;
    if ($file =~ m/.*\/(.*)$/){
        $filename = $1;
    }

    ### gzipped version of the infile
    ### The list form of the pipe open and the three-argument open make sure that the file name is handed over
    ### literally, and is neither interpreted by a shell nor as an open mode
    my $in;
    if ($file =~ /\.gz$/){
        open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
    }
    else{
        open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
    }

    if ($skip){
        warn "Skipping the first $skip reads from $file\n";
        sleep (1);
    }
    if ($upto){
        warn "Processing reads up to sequence no. $upto from $file\n";
        sleep (1);
    }

    my $C_to_T_infile = my $G_to_A_infile = $filename;

    if ($gzip){
        $C_to_T_infile =~ s/$/_C_to_T.fastq.gz/;
        $G_to_A_infile =~ s/$/_G_to_A.fastq.gz/;
    }
    else{
        $C_to_T_infile =~ s/$/_C_to_T.fastq/;
        $G_to_A_infile =~ s/$/_G_to_A.fastq/;
    }

    if ($prefix){
        $C_to_T_infile = "$prefix.$C_to_T_infile";
        $G_to_A_infile = "$prefix.$G_to_A_infile";
    }

    ### NOTE(review): the gzip pipes below are still run through a shell (the output redirection requires it),
    ### so $temp_dir and the file name must not contain shell metacharacters
    if ($directional){
        if ($read_number == 1){
            warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
            if ($gzip){
                open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
            }
            else{
                open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
            }
        }
        elsif ($read_number == 2){
            warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
            if ($gzip){
                open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
            }
            else{
                open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
            }
        }
        else{
            die "Read number needs to be 1 or 2, but was $read_number!\n\n";
        }
    }
    else{
        warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
        warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
        if ($gzip){
            open (CTOT,"| gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
            open (GTOA,"| gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
        }
        else{
            open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
            open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
        }
    }

    my $count = 0;

    ### each FastQ entry is expected to consist of exactly four lines
    while (1){
        my $identifier    = <$in>;
        my $sequence      = <$in>;
        my $identifier2   = <$in>;
        my $quality_score = <$in>;
        last unless ($identifier and $sequence and $identifier2 and $quality_score);
        ++$count;

        $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

        if ($skip){
            next unless ($count > $skip);
        }
        if ($upto){
            last if ($count > $upto);
        }

        $sequence= uc$sequence; # make input file case insensitive

        ## small check if the sequence file appears to be a FastQ file
        if ($count == 1){
            if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
                die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
            }
        }
        my $sequence_C_to_T = my $sequence_G_to_A = $sequence;

        ### appending the read number to the read ID (the substitution inserts it in front of the trailing newline)
        if ($read_number == 1){
            if ($bowtie2){
                $identifier =~ s/$/\/1\/1/;
            }
            else{
                $identifier =~ s/$/\/1/;
            }
        }
        elsif ($read_number == 2){
            if ($bowtie2){
                $identifier =~ s/$/\/2\/2/;
            }
            else{
                $identifier =~ s/$/\/2/;
            }
        }
        else{
            die "Read number needs to be 1 or 2\n";
        }

        $sequence_C_to_T =~ tr/C/T/;
        $sequence_G_to_A =~ tr/G/A/;

        if ($directional){
            if ($read_number == 1){
                print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
            }
            else{
                print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
            }
        }
        else{
            print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
            print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
        }
    }

    ### The return value is deliberately not checked: if we stopped reading early (--upto), zcat may have been
    ### terminated by a SIGPIPE, which makes close return false
    close $in;

    if ($directional){
        if ($read_number == 1){
            warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
            close CTOT or die "Failed to close filehandle $!\n";
            return ($C_to_T_infile);
        }
        else{
            warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
            close GTOA or die "Failed to close filehandle $!\n";
            return ($G_to_A_infile);
        }
    }
    else{
        warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
        close CTOT or die "Failed to close filehandle $!\n";
        close GTOA or die "Failed to close filehandle $!\n";
        return ($C_to_T_infile,$G_to_A_infile);
    }
}
6186 | |
6187 | |
6188 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES | |
6189 | |
6190 sub biTransformFastQFiles_paired_end_bowtie1_gzip { | |
6191 my ($file_1,$file_2) = @_; | |
6192 my ($dir,$filename); | |
6193 | |
6194 if ($file_1 =~ /\//){ | |
6195 ($dir,$filename) = $file_1 =~ m/(.*\/)(.*)$/; | |
6196 } | |
6197 else{ | |
6198 $filename = $file_1; | |
6199 } | |
6200 | |
6201 ### gzipped version of infile 1 | |
6202 if ($file_1 =~ /\.gz$/){ | |
6203 open (IN_1,"zcat $file_1 |") or die "Couldn't read from file $file_1: $!\n"; | |
6204 } | |
6205 else{ | |
6206 open (IN_1,$file_1) or die "Couldn't read from file $file_1: $!\n"; | |
6207 } | |
6208 ### gzipped version of infile 2 | |
6209 if ($file_2 =~ /\.gz$/){ | |
6210 open (IN_2,"zcat $file_2 |") or die "Couldn't read from file $file_2: $!\n"; | |
6211 } | |
6212 else{ | |
6213 open (IN_2,$file_2) or die "Couldn't read from file $file_2: $!\n"; | |
6214 } | |
6215 | |
6216 | |
6217 if ($skip){ | |
6218 warn "Skipping the first $skip reads from $file_1 and $file_2\n"; | |
6219 sleep (1); | |
6220 } | |
6221 if ($upto){ | |
6222 warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n"; | |
6223 sleep (1); | |
6224 } | |
6225 | |
6226 my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename; | |
6227 | |
6228 if ($prefix){ | |
6229 # warn "Prefixing $prefix:\nold: $CT_plus_GA_infile\nold: $GA_plus_CT_infile\n\n"; | |
6230 $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile"; | |
6231 $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile"; | |
6232 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; | |
6233 } | |
6234 | |
6235 $CT_plus_GA_infile =~ s/$/.CT_plus_GA.fastq.gz/; | |
6236 $GA_plus_CT_infile =~ s/$/.GA_plus_CT.fastq.gz/; | |
6237 # warn "Prefixing $prefix:\nnew: $CT_plus_GA_infile\nnew: $GA_plus_CT_infile\n\n"; | |
6238 | |
6239 warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n"; | |
6240 open (CTPLUSGA,"| gzip -c - > ${temp_dir}${CT_plus_GA_infile}") or die "Can't write to file: $!\n"; | |
6241 # open (CTPLUSGA,'>',"$temp_dir$CT_plus_GA_infile") or die "Couldn't write to file $!\n"; | |
6242 | |
6243 unless ($directional){ | |
6244 print "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n"; | |
6245 open (GAPLUSCT,"| gzip -c - > ${temp_dir}${GA_plus_CT_infile}") or die "Can't write to file: $!\n"; | |
6246 } | |
6247 | |
6248 ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format: | |
6249 ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate> | |
6250 | |
6251 my $count = 0; | |
6252 while (1){ | |
6253 my $identifier_1 = <IN_1>; | |
6254 my $sequence_1 = <IN_1>; | |
6255 my $identifier2_1 = <IN_1>; | |
6256 my $quality_score_1 = <IN_1>; | |
6257 | |
6258 my $identifier_2 = <IN_2>; | |
6259 my $sequence_2 = <IN_2>; | |
6260 my $identifier2_2 = <IN_2>; | |
6261 my $quality_score_2 = <IN_2>; | |
6262 | |
6263 last unless ($identifier_1 and $sequence_1 and $identifier2_1 and $quality_score_1 and $identifier_2 and $sequence_2 and $identifier2_2 and $quality_score_2); | |
6264 | |
6265 ++$count; | |
6266 | |
6267 ## small check if the sequence file appears to be a FastQ file | |
6268 if ($count == 1){ | |
6269 if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){ | |
6270 die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n"; | |
6271 } | |
6272 if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){ | |
6273 die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n"; | |
6274 } | |
6275 } | |
6276 | |
6277 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces | |
6278 chomp $identifier_1; | |
6279 chomp $sequence_1; | |
6280 chomp $sequence_2; | |
6281 chomp $quality_score_1; | |
6282 chomp $quality_score_2; | |
6283 | |
6284 $identifier_1 =~ s/^\@//; | |
6285 $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever) | |
6286 | |
6287 if ($skip){ | |
6288 next unless ($count > $skip); | |
6289 } | |
6290 if ($upto){ | |
6291 last if ($count > $upto); | |
6292 } | |
6293 | |
6294 $sequence_1 = uc$sequence_1; # make input file 1 case insensitive | |
6295 $sequence_2 = uc$sequence_2; # make input file 2 case insensitive | |
6296 | |
6297 # print "$identifier_1\t$sequence_1\t$quality_score_1\t$sequence_2\t$quality_score_2\n"; | |
6298 my $sequence_1_C_to_T = $sequence_1; | |
6299 my $sequence_2_G_to_A = $sequence_2; | |
6300 $sequence_1_C_to_T =~ tr/C/T/; | |
6301 $sequence_2_G_to_A =~ tr/G/A/; | |
6302 | |
6303 print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n"; | |
6304 | |
6305 unless ($directional){ | |
6306 my $sequence_1_G_to_A = $sequence_1; | |
6307 my $sequence_2_C_to_T = $sequence_2; | |
6308 $sequence_1_G_to_A =~ tr/G/A/; | |
6309 $sequence_2_C_to_T =~ tr/C/T/; | |
6310 print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n"; | |
6311 } | |
6312 } | |
6313 | |
6314 close CTPLUSGA or die "Couldn't close filehandle\n"; | |
6315 warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n"; | |
6316 | |
6317 if ($directional){ | |
6318 warn "\n"; | |
6319 return ($CT_plus_GA_infile); | |
6320 } | |
6321 else{ | |
6322 close GAPLUSCT or die "Couldn't close filehandle\n"; | |
6323 warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n"; | |
6324 return ($CT_plus_GA_infile,$GA_plus_CT_infile); | |
6325 } | |
6326 } | |
6327 | |
6328 | |
sub fix_IDs{
  ### Returns a copy of the given read ID in which every run of spaces and/or tabs has been
  ### collapsed into a single underscore. All other characters (including a trailing
  ### newline) are passed through unchanged.
  my ($raw_id) = @_;
  (my $fixed_id = $raw_id) =~ s/[ \t]+/_/g;
  return $fixed_id;
}
6334 | |
sub ensure_sensical_alignment_orientation_single_end{
  ### Decides whether a single-end alignment was reported on the strand that makes sense for the
  ### Bowtie instance it came from.
  ###   $index  - position of the Bowtie instance in @fhs
  ###   $strand - strand of the alignment ('+' or '-')
  ### Returns 1 (and increments the instance's 'seen' counter) for a sensible orientation, and
  ### 0 (and increments 'wrong_strand') for the nonsensical one. Dies if the instance name is
  ### not one of the four known read/genome conversion combinations.
  my ($index,$strand) = @_;

  ### the strand on which an alignment is accepted, per instance name
  my %sensible_strand = (
    CTreadCTgenome => '+',   # C->T read against C->T genome
    CTreadGAgenome => '-',   # C->T read against G->A genome
    GAreadCTgenome => '-',   # G->A read against C->T genome
    GAreadGAgenome => '+',   # G->A read against G->A genome
  );

  my $instance_name = $fhs[$index]->{name};
  unless (exists $sensible_strand{$instance_name}){
    die "One of the above conditions must be true\n";
  }

  my $wanted   = $sensible_strand{$instance_name};
  my $unwanted = ($wanted eq '+') ? '-' : '+';

  ### a strand value other than '+' or '-' touches neither counter and falls off the end of the sub
  if ($strand eq $wanted){
    $fhs[$index]->{seen}++;
    return 1;
  }
  elsif ($strand eq $unwanted){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }
}
6407 | |
sub ensure_sensical_alignment_orientation_paired_ends{
  ### Decides whether a paired-end alignment was reported in the orientation that makes sense for
  ### the Bowtie instance it came from.
  ###   $index               - position of the Bowtie instance in @fhs
  ###   $id_1, $strand_1     - read ID and strand of the first reported mate
  ###   $id_2, $strand_2     - read ID and strand of the second reported mate
  ### Returns 1 (and increments the instance's 'seen' counter) for the sensible orientation and
  ### 0 (and increments 'wrong_strand') for the mirrored, nonsensical one. Dies for any other
  ### combination of IDs/strands and for an unknown instance name.
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  ### true:  the sensible pair is read 1 (+) reported first, read 2 (-) second
  ### false: the sensible pair is read 2 (+) reported first, read 1 (-) second
  my %read_1_comes_first = (
    CTread1GAread2CTgenome => 1,
    GAread1CTread2GAgenome => 1,
    GAread1CTread2CTgenome => 0,
    CTread1GAread2GAgenome => 0,
  );

  my $instance_name = $fhs[$index]->{name};
  unless (exists $read_1_comes_first{$instance_name}){
    die "One of the above conditions must be true\n";
  }

  ### both accepted layouts have the first mate on (+) and the second on (-); they differ only
  ### in which of the two mates carries the read 1 / read 2 ID ending
  my $read_1_then_read_2 = ($id_1 =~ /1$/ and $strand_1 eq '+' and $id_2 =~ /2$/ and $strand_2 eq '-');
  my $read_2_then_read_1 = ($id_1 =~ /2$/ and $strand_1 eq '+' and $id_2 =~ /1$/ and $strand_2 eq '-');

  my ($sensible,$nonsensical) = $read_1_comes_first{$instance_name}
    ? ($read_1_then_read_2,$read_2_then_read_1)
    : ($read_2_then_read_1,$read_1_then_read_2);

  if ($sensible){
    $fhs[$index]->{seen}++;
    return 1;
  }
  if ($nonsensical){
    $fhs[$index]->{wrong_strand}++;
    return 0;
  }
  die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
}
6504 | |
6505 ##################################################################################################################################################### | |
6506 | |
6507 ### Bowtie 1 (default) | PAIRED-END | FASTA | |
6508 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastA {
  ### Bowtie 1 | paired-end | FastA
  ### Starts one Bowtie process per entry of @fhs and primes each entry with the first pair of
  ### output lines (last_line_1, last_line_2) and the ID of read 1 (last_seq_id).
  ### The four file names passed in are only used for the log message; the files that are
  ### aligned come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ### in directional mode, entries without an input file are not aligned; only their state is cleared
    if ($directional and not $instance->{inputfile_1}){
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($instance->{name} eq 'CTread1GAread2CTgenome' or $instance->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $instance->{name} (reading in sequences from $temp_dir$instance->{inputfile_1} and $temp_dir$instance->{inputfile_2}, with the options: $bt_options)\n";
    my $bowtie_command = "$path_to_bowtie $bt_options $instance->{bisulfiteIndex} -1 $temp_dir$instance->{inputfile_1} -2 $temp_dir$instance->{inputfile_2} |";
    open ($instance->{fh},$bowtie_command) or die "Can't open pipe to bowtie: $!";

    my $line_1 = $instance->{fh}->getline();
    my $line_2 = $instance->{fh}->getline();

    ### without a complete first pair of output lines there is nothing to store
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    chomp $line_1;
    chomp $line_2;
    my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
    my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ### either read 1 or read 2. The ID that carried the /1 tag (now removed) becomes last_seq_id
    if ($id_1 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $instance->{last_line_1} = $line_1; # either read 1 or read 2
    $instance->{last_line_2} = $line_2; # either read 1 or read 2
    warn "Found first alignment:\n$instance->{last_line_1}\n$instance->{last_line_2}\n";
  }
}
6587 | |
6588 ### Bowtie 2 | PAIRED-END | FASTA | |
6589 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Bowtie 2 | paired-end | FastA
  ### Starts one Bowtie 2 process per entry of @fhs, skips the SAM header of its output and primes
  ### the entry with the first pair of alignment lines (last_line_1, last_line_2) and the ID of
  ### read 1 (last_seq_id). If no pair of lines can be read, all three fields are set to undef.
  ### The four file names passed in are only used for the log message; the files that are
  ### aligned come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  ### Dies if the Bowtie 2 pipe cannot be opened, or if neither ID of the first pair of lines
  ### carries the /1 tag.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2 feeding in the converted sequence files and reading in the
  ## first lines of the output, and storing them in @fhs
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode, entries without an input file are not aligned; only their state is cleared
    if ($directional){
      unless ($fh->{inputfile_1}){
        $fh->{last_seq_id} = undef;
        $fh->{last_line_1} = undef;
        $fh->{last_line_2} = undef;
        next;
      }
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so everything up to the first non-header line is skipped.
    ### A lexical variable is used here so that the caller's $_ is left untouched.
    my $line_1;
    while (defined($line_1 = $fh->{fh}->getline())){
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced output we store the first pair of lines
    if ($line_1 and $line_2) {
      chomp $line_1;
      chomp $line_2;
      my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
      my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

      ### Either line can belong to read 1. The ID that carried the /1 tag (now removed) becomes last_seq_id
      ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so /1/1 or /2/2 was added to start with)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        ### this is fatal, exactly as in the other paired-end alignment subs: carrying on would store
        ### the two lines without a matching last_seq_id
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
6677 | |
6678 ### Bowtie 1 (default) | PAIRED-END | FASTQ | |
6679 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Bowtie 1 | paired-end | FastQ
  ### Starts one Bowtie process per entry of @fhs and primes each entry with the first pair of
  ### output lines (last_line_1, last_line_2) and the ID of read 1 (last_seq_id).
  ### The four file names passed in are only used for the log message; the files that are
  ### aligned come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message
    = $directional ? "Input file is $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"
    : $pbat        ? "Input file is $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ; PBAT-Seq)\n"
    :                "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 and $G_to_A_infile_1 and $C_to_T_infile_2 (non-directional; FastQ)\n";
  warn $input_message;

  ### directional and PBAT runs only use two of the four entries in @fhs
  my $two_instances_only = ($directional or $pbat);

  if ($two_instances_only){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ### skipping unwanted filehandles: entries without an input file only get their state cleared
    if ($two_instances_only and not $instance->{inputfile_1}){
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($instance->{name} eq 'CTread1GAread2CTgenome' or $instance->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    if ($gzip){
      ### with --gzip a single compressed file (inputfile_1) is decompressed on the fly and handed to Bowtie via --12 on STDIN
      warn "Now starting a Bowtie paired-end alignment for $instance->{name} (reading in sequences from ${temp_dir}$instance->{inputfile_1}, with the options: $bt_options)\n";
      my $bowtie_command = "zcat ${temp_dir}$instance->{inputfile_1} | $path_to_bowtie $bt_options $instance->{bisulfiteIndex} --12 - |";
      open ($instance->{fh},$bowtie_command) or die "Can't open pipe to bowtie: $!";
    }
    else{
      warn "Now starting a Bowtie paired-end alignment for $instance->{name} (reading in sequences from ${temp_dir}$instance->{inputfile_1} and ${temp_dir}$instance->{inputfile_2}, with the options: $bt_options))\n";
      sleep(5);
      my $bowtie_command = "$path_to_bowtie $bt_options $instance->{bisulfiteIndex} -1 $temp_dir$instance->{inputfile_1} -2 $temp_dir$instance->{inputfile_2} |";
      open ($instance->{fh},$bowtie_command) or die "Can't open pipe to bowtie: $!";
    }

    my $line_1 = $instance->{fh}->getline();
    my $line_2 = $instance->{fh}->getline();

    ### without a complete first pair of output lines there is nothing to store
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    chomp $line_1;
    chomp $line_2;
    my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
    my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first, which can be
    ### either read 1 or read 2. The ID that carried the /1 tag (now removed) becomes last_seq_id
    if ($id_1 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $instance->{last_line_1} = $line_1; # read 1 or read 2
    $instance->{last_line_2} = $line_2; # read 1 or read 2
    warn "Found first alignment:\n$instance->{last_line_1}\n$instance->{last_line_2}\n";
  }
}
6767 | |
6768 ### Bowtie 2 | PAIRED-END | FASTQ | |
6769 | |
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Bowtie 2 | paired-end | FastQ
  ### Starts one Bowtie 2 process per entry of @fhs, skips the SAM header of its output and primes
  ### the entry with the first pair of alignment lines (last_line_1, last_line_2) and the ID of
  ### read 1 (last_seq_id).
  ### The four file names passed in are only used for the log message; the files that are
  ### aligned come from the inputfile_1/inputfile_2 fields of each @fhs entry.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  my $input_message
    = $directional ? "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n"
    : $pbat        ? "Input files are $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ)\n"
    :                "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  warn $input_message;

  ### directional and PBAT runs only use two of the four entries in @fhs
  my $two_instances_only = ($directional or $pbat);

  if ($two_instances_only){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ### skipping unwanted filehandles: entries without an input file only get their state cleared
    if ($two_instances_only and not $instance->{inputfile_1}){
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($instance->{name} eq 'CTread1GAread2CTgenome' or $instance->{name} eq 'GAread1CTread2GAgenome');
    my $bt2_options = $bowtie_options;
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie 2 paired-end alignment for $instance->{name} (reading in sequences from $temp_dir$instance->{inputfile_1} and $temp_dir$instance->{inputfile_2}, with the options: $bt2_options))\n";
    my $bowtie2_command = "$path_to_bowtie $bt2_options -x $instance->{bisulfiteIndex} -1 $temp_dir$instance->{inputfile_1} -2 $temp_dir$instance->{inputfile_2} |";
    open ($instance->{fh},$bowtie2_command) or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format: keep reading while the line is a header line (starts with @);
    ### stop at the first other line or when the output ends
    do {
      $_ = $instance->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    my $line_1 = $_;
    my $line_2 = $instance->{fh}->getline();

    ### without a complete first pair of output lines there is nothing to store
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$instance}{qw(last_seq_id last_line_1 last_line_2)} = (undef,undef,undef);
      next INSTANCE;
    }

    chomp $line_1;
    chomp $line_2;
    my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
    my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

    ### Either line can belong to read 1. The ID that carried the /1 tag (now removed) becomes last_seq_id
    ### (remember that Bowtie 2 clips off /1 or /2 line endings itself, so /1/1 or /2/2 was added to start with)
    if ($id_1 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $instance->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $instance->{last_line_1} = $line_1; # read 1 or read 2
    $instance->{last_line_2} = $line_2; # read 1 or read 2
    warn "Found first alignment:\n$instance->{last_line_1}\n$instance->{last_line_2}\n";
  }
}
6862 | |
6863 ##################################################################################################################################################### | |
6864 | |
6865 ### Bowtie 1 (default) | SINGLE-END | FASTA | |
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  ### Bowtie 1 | single-end | FastA
  ### Starts one Bowtie process per entry of @fhs and primes each entry with the first output
  ### line (last_line) and its sequence identifier (last_seq_id).
  ### The two file names passed in are only used for the log message; the file that is
  ### aligned comes from the inputfile field of each @fhs entry.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($instance->{name} eq 'CTreadCTgenome' or $instance->{name} eq 'GAreadGAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie aligner for $instance->{name} (reading in sequences from $temp_dir$instance->{inputfile} with options: $bt_options)\n";

    ### compressed input is decompressed on the fly and handed to Bowtie on STDIN
    my $bowtie_command = $gzip
      ? "zcat $temp_dir$instance->{inputfile} | $path_to_bowtie $bt_options $instance->{bisulfiteIndex} - |"
      : "$path_to_bowtie $bt_options $instance->{bisulfiteIndex} $temp_dir$instance->{inputfile} |";
    open ($instance->{fh},$bowtie_command) or die "Can't open pipe to bowtie: $!";

    $_ = $instance->{fh}->getline();

    ### no output line: there is nothing to store for this instance
    unless ($_){
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$instance}{qw(last_seq_id last_line)} = (undef,undef);
      next INSTANCE;
    }

    chomp $_;
    my $id = (split(/\t/,$_))[0]; # first element of the Bowtie output (= the sequence identifier)
    @{$instance}{qw(last_seq_id last_line)} = ($id,$_);
    warn "Found first alignment:\t$instance->{last_line}\n";
  }
}
6919 | |
6920 ### Bowtie 2 | SINGLE-END | FASTA | |
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  ### Bowtie 2 | single-end | FastA
  ### Starts one Bowtie 2 process per entry of @fhs, skips the SAM header of its output and primes
  ### the entry with the first result line (last_line) and its sequence identifier (last_seq_id).
  ### The two file names passed in are only used for the log message; the file that is
  ### aligned comes from the inputfile field of each @fhs entry.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

 INSTANCE:
  foreach my $instance (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($instance->{name} eq 'CTreadCTgenome' or $instance->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options;
    $bt2_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $instance->{name} (reading in sequences from $temp_dir$instance->{inputfile} with options: $bt2_options)\n";
    my $bowtie2_command = "$path_to_bowtie $bt2_options -x $instance->{bisulfiteIndex} -U $temp_dir$instance->{inputfile} |";
    open ($instance->{fh},$bowtie2_command) or die "Can't open pipe to bowtie 2: $!";

    ### Bowtie 2 writes SAM format: keep reading while the line is a header line (starts with @);
    ### stop at the first other line or when the output ends
    do {
      $_ = $instance->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    ### no result line at all: there is nothing to store for this instance
    unless ($_){
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$instance}{qw(last_seq_id last_line)} = (undef,undef);
      next INSTANCE;
    }

    ### Bowtie 2 outputs a result line even for sequences without any alignments, so the first line is always stored
    chomp $_;
    my $id = (split(/\t/,$_))[0]; # first element of the Bowtie 2 output (= the sequence identifier)
    @{$instance}{qw(last_seq_id last_line)} = ($id,$_);
    warn "Found first alignment:\t$instance->{last_line}\n";
  }
}
6979 | |
6980 | |
6981 ### Bowtie 1 (default) | SINGLE-END | FASTQ | |
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  # Starts one Bowtie 1 process for every entry in @fhs on the converted FastQ read file
  # and stores the first line of each process' output in that entry.
  #
  # Arguments: the names of the C->T and G->A converted read files. They are only used for
  #            the log messages here; the file each process reads is $fh->{inputfile}.
  # Effects:   sets {fh}, {last_seq_id} and {last_line} on every element of @fhs and dies
  #            if a pipe to Bowtie cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }


  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
  ## the data structure above
  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    ### restrict each instance to the one orientation that makes sense for its read/genome
    ### combination
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5); # pause before each launch (kept as-is; the reason is not visible in this part of the file)

    ### The command has to use $bt_options, i.e. the very options announced in the message
    ### above. Passing the plain $bowtie_options here meant that --norc/--nofw never reached
    ### Bowtie, unlike in the FastA and Bowtie 2 variants of this routine.
    if ($gzip){
      open ($fh->{fh},"zcat $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # if Bowtie produces an alignment we store the first line of the output
    $_ = $fh->{fh}->getline();
    if ($_) {
      chomp;
      my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $_;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
7040 | |
7041 ### Bowtie 2 | SINGLE-END | FASTQ | |
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  # Starts one Bowtie 2 process for every entry in @fhs on the converted FastQ read file,
  # skips the SAM header of each output stream and stores the first record in the entry.
  #
  # Arguments: the names of the C->T and G->A converted read files. They are only used for
  #            the log messages here; the file each process reads is $fh->{inputfile}.
  # Effects:   sets {fh}, {last_seq_id} and {last_line} on every element of @fhs and dies
  #            if a pipe to Bowtie 2 cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  # --directional takes precedence over --pbat when choosing the message
  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n\n";
  }
  else{
    warn( $pbat
	  ? "Input file is $G_to_A_infile (FastQ)\n\n"
	  : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n" );
  }

  ## One Bowtie 2 instance is started per element of @fhs; the first line of its output is
  ## stored in that element
  warn( ($directional or $pbat)
	? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
	: "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n" );

  foreach my $fh (@fhs) {
    ### reads converted the same way as the genome are only aligned in forward orientation
    ### (--norc), all other combinations only as reverse complement (--nofw)
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome');
    my $bt2_options = $bowtie_options;
    $bt2_options .= $same_conversion ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    my $bowtie2_call = "$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile}";
    open ($fh->{fh},"$bowtie2_call |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM, so we keep reading until the first line that is not a header
    ### line (header lines start with @) or until the output is exhausted. The line is held
    ### in $_ so that chomp and split below work on it by default.
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    if ($_) {
      # Bowtie 2 prints a record even for reads without alignment, so this is the first read
      chomp;
      $fh->{last_seq_id} = (split(/\t/))[0];   # first SAM column = sequence identifier
      $fh->{last_line}   = $_;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    else {
      # no record at all; for Bowtie 2 this should only happen at the end of a file
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      @{$fh}{qw(last_seq_id last_line)} = (undef, undef);
    }
  }
}
7106 | |
7107 ########################################################################################################################################### | |
7108 | |
sub reset_counters_and_fhs{
  # Re-initialises the global bookkeeping before a new input file is processed:
  #   %counting - every counter is set back to 0
  #   @fhs      - replaced by fresh records, one per aligner instance that is needed for
  #               the current library type (--directional / --pbat / neither)
  #
  # Argument: the input file name; a comma in it marks paired-end input. It is only
  #           inspected when $directional or $pbat is set.
  my $filename = shift;

  %counting = map { $_ => 0 } qw(
    total_meCHH_count
    total_meCHG_count
    total_meCpG_count
    total_meC_unknown_count
    total_unmethylated_CHH_count
    total_unmethylated_CHG_count
    total_unmethylated_CpG_count
    total_unmethylated_C_unknown_count
    sequences_count
    no_single_alignment_found
    unsuitable_sequence_count
    genomic_sequence_could_not_be_extracted_count
    unique_best_alignment_count
    low_complexity_alignments_overruled_count
    CT_CT_count
    CT_GA_count
    GA_CT_count
    GA_GA_count
    CT_GA_CT_count
    GA_CT_GA_count
    GA_CT_CT_count
    CT_GA_GA_count
    alignments_rejected_count
  );
  # CT_CT_count    : CT read/CT genome, original top strand
  # CT_GA_count    : CT read/GA genome, original bottom strand
  # GA_CT_count    : GA read/CT genome, complementary to original top strand
  # GA_GA_count    : GA read/GA genome, complementary to original bottom strand
  # CT_GA_CT_count : CT read1/GA read2/CT genome, original top strand
  # GA_CT_GA_count : GA read1/CT read2/GA genome, complementary to original bottom strand
  # GA_CT_CT_count : GA read1/CT read2/CT genome, complementary to original top strand
  # CT_GA_GA_count : CT read1/GA read2/GA genome, original bottom strand
  # alignments_rejected_count : only relevant if --directional was specified

  my %strand_identity_of = (
    CTreadCTgenome => 'con ori forward',
    CTreadGAgenome => 'con ori reverse',
    GAreadCTgenome => 'compl ori con forward',
    GAreadGAgenome => 'compl ori con reverse',
  );

  # Only single-end --directional or --pbat runs use a reduced set of two instances;
  # paired-end files and non-directional libraries always get all four.
  my $single_end = 0;
  if ($directional or $pbat){
    $single_end = ($filename =~ /,/) ? 0 : 1;
  }

  my @instances = qw(CTreadCTgenome CTreadGAgenome GAreadCTgenome GAreadGAgenome);
  if ($single_end){
    # --directional is checked first, so it wins should both flags be set
    @instances = $directional ? qw(CTreadCTgenome CTreadGAgenome)
                              : qw(GAreadCTgenome GAreadGAgenome);
  }

  @fhs = map {
    +{
      name            => $_,
      strand_identity => $strand_identity_of{$_},
      bisulfiteIndex  => (/CTgenome$/ ? $CT_index_basename : $GA_index_basename),
      seen            => 0,
      wrong_strand    => 0,
    }
  } @instances;
}
7258 | |
7259 | |
7260 sub process_command_line{ | |
7261 my @bowtie_options; | |
7262 my $help; | |
7263 my $mates1; | |
7264 my $mates2; | |
7265 my $path_to_bowtie; | |
7266 my $fastq; | |
7267 my $fasta; | |
7268 my $skip; | |
7269 my $qupto; | |
7270 my $phred64; | |
7271 my $phred33; | |
7272 my $solexa; | |
7273 my $mismatches; | |
7274 my $seed_length; | |
7275 my $best; | |
7276 my $sequence_format; | |
7277 my $version; | |
7278 my $quiet; | |
7279 my $chunk; | |
7280 my $non_directional; | |
7281 my $ceiling; | |
7282 my $maxins; | |
7283 my $minins; | |
7284 my $unmapped; | |
7285 my $multi_map; | |
7286 my $output_dir; | |
7287 my $bowtie2; | |
7288 my $vanilla; | |
7289 my $sam_no_hd; | |
7290 my $seed_extension_fails; | |
7291 my $reseed_repetitive_seeds; | |
7292 my $most_valid_alignments; | |
7293 my $score_min; | |
7294 my $parallel; | |
7295 my $temp_dir; | |
7296 my $rdg; | |
7297 my $rfg; | |
7298 my $non_bs_mm; | |
7299 my $samtools_path; | |
7300 my $bam; | |
7301 my $gzip; | |
7302 my $pbat; | |
7303 my $prefix; | |
7304 my $old_flag; | |
7305 my $basename; | |
7306 my $sam; | |
7307 my $multicore; | |
7308 | |
7309 my $command_line = GetOptions ('help|man' => \$help, | |
7310 '1=s' => \$mates1, | |
7311 '2=s' => \$mates2, | |
7312 'path_to_bowtie=s' => \$path_to_bowtie, | |
7313 'f|fasta' => \$fasta, | |
7314 'q|fastq' => \$fastq, | |
7315 's|skip=i' => \$skip, | |
7316 'u|upto=i' => \$qupto, | |
7317 'phred33-quals' => \$phred33, | |
7318 'phred64-quals|solexa1' => \$phred64, | |
7319 'solexa-quals' => \$solexa, | |
7320 'n|seedmms=i' => \$mismatches, | |
7321 'l|seedlen=i' => \$seed_length, | |
7322 'no_best' => \$best, | |
7323 'version' => \$version, | |
7324 'quiet' => \$quiet, | |
7325 'chunkmbs=i' => \$chunk, | |
7326 'non_directional' => \$non_directional, | |
7327 'I|minins=i' => \$minins, | |
7328 'X|maxins=i' => \$maxins, | |
7329 'e|maqerr=i' => \$ceiling, | |
7330 'un|unmapped' => \$unmapped, | |
7331 'ambiguous' => \$multi_map, | |
7332 'o|output_dir=s' => \$output_dir, | |
7333 'bowtie2' => \$bowtie2, | |
7334 'vanilla' => \$vanilla, | |
7335 'sam-no-hd' => \$sam_no_hd, | |
7336 'D=i' => \$seed_extension_fails, | |
7337 'R=i' => \$reseed_repetitive_seeds, | |
7338 'score_min=s' => \$score_min, | |
7339 'most_valid_alignments=i' => \$most_valid_alignments, | |
7340 'p=i' => \$parallel, | |
7341 'temp_dir=s' => \$temp_dir, | |
7342 'rdg=s' => \$rdg, | |
7343 'rfg=s' => \$rfg, | |
7344 'non_bs_mm' => \$non_bs_mm, | |
7345 'samtools_path=s' => \$samtools_path, | |
7346 'bam' => \$bam, | |
7347 'gzip' => \$gzip, | |
7348 'pbat' => \$pbat, | |
7349 'prefix=s' => \$prefix, | |
7350 'old_flag' => \$old_flag, | |
7351 'B|basename=s' => \$basename, | |
7352 'sam' => \$sam, | |
7353 'multicore=i' => \$multicore, | |
7354 ); | |
7355 | |
7356 | |
7357 ### EXIT ON ERROR if there were errors with any of the supplied options | |
7358 unless ($command_line){ | |
7359 die "Please respecify command line options\n"; | |
7360 } | |
7361 ### HELPFILE | |
7362 if ($help){ | |
7363 print_helpfile(); | |
7364 exit; | |
7365 } | |
7366 if ($version){ | |
7367 print << "VERSION"; | |
7368 | |
7369 | |
7370 Bismark - Bisulfite Mapper and Methylation Caller. | |
7371 | |
7372 Bismark Version: $bismark_version | |
7373 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics | |
7374 www.bioinformatics.babraham.ac.uk/projects/ | |
7375 | |
7376 | |
7377 VERSION | |
7378 exit; | |
7379 } | |
7380 | |
7381 | |
7382 ########################## | |
7383 ### PROCESSING OPTIONS ### | |
7384 ########################## | |
7385 | |
7386 unless ($bowtie2){ | |
7387 $bowtie2 = 0; | |
7388 } | |
7389 unless ($sam_no_hd){ | |
7390 $sam_no_hd =0; | |
7391 } | |
7392 | |
7393 ### PATH TO BOWTIE | |
7394 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH | |
7395 if ($path_to_bowtie){ | |
7396 unless ($path_to_bowtie =~ /\/$/){ | |
7397 $path_to_bowtie =~ s/$/\//; | |
7398 } | |
7399 if (-d $path_to_bowtie){ | |
7400 if ($bowtie2){ | |
7401 $path_to_bowtie = "${path_to_bowtie}bowtie2"; | |
7402 } | |
7403 else{ | |
7404 $path_to_bowtie = "${path_to_bowtie}bowtie"; | |
7405 } | |
7406 } | |
7407 else{ | |
7408 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; | |
7409 } | |
7410 } | |
7411 else{ | |
7412 if ($bowtie2){ | |
7413 $path_to_bowtie = 'bowtie2'; | |
7414 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } | |
7415 else{ | |
7416 $path_to_bowtie = 'bowtie'; | |
7417 warn "Path to Bowtie specified as: $path_to_bowtie\n"; | |
7418 } | |
7419 } | |
7420 | |
7421 | |
7422 if ($sam){ | |
7423 warn "Output format manually set as SAM\n"; | |
7424 } | |
7425 else{ | |
7426 $bam = 1; | |
7427 warn "Output format is BAM (default)\n"; | |
7428 } | |
7429 | |
7430 ### OUTPUT REQUESTED AS BAM FILE (default) | |
7431 if ($bam){ | |
7432 if ($vanilla){ | |
7433 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; | |
7434 } | |
7435 | |
7436 ### PATH TO SAMTOOLS | |
7437 if (defined $samtools_path){ | |
7438 # if Samtools was specified as full command | |
7439 if ($samtools_path =~ /samtools$/){ | |
7440 if (-e $samtools_path){ | |
7441 # Samtools executable found | |
7442 } | |
7443 else{ | |
7444 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
7445 } | |
7446 } | |
7447 else{ | |
7448 unless ($samtools_path =~ /\/$/){ | |
7449 $samtools_path =~ s/$/\//; | |
7450 } | |
7451 $samtools_path .= 'samtools'; | |
7452 if (-e $samtools_path){ | |
7453 # Samtools executable found | |
7454 } | |
7455 else{ | |
7456 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
7457 } | |
7458 } | |
7459 | |
7460 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; | |
7461 $bam = 1; | |
7462 } | |
7463 # Check whether Samtools is in the PATH if no path was supplied by the user | |
7464 else{ | |
7465 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH | |
7466 $samtools_path = `which samtools`; | |
7467 chomp $samtools_path; | |
7468 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; | |
7469 $bam = 1; | |
7470 } | |
7471 } | |
7472 | |
7473 unless (defined $samtools_path){ | |
7474 $bam = 2; | |
7475 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; | |
7476 } | |
7477 sleep (1); | |
7478 } | |
7479 | |
7480 | |
7481 #################################### | |
7482 ### PROCESSING ARGUMENTS | |
7483 | |
7484 ### GENOME FOLDER | |
7485 my $genome_folder = shift @ARGV; # mandatory | |
7486 unless ($genome_folder){ | |
7487 warn "Genome folder was not specified!\n"; | |
7488 print_helpfile(); | |
7489 exit; | |
7490 } | |
7491 | |
7492 ### checking that the genome folder, all subfolders and the required bowtie index files exist | |
7493 unless ($genome_folder =~/\/$/){ | |
7494 $genome_folder =~ s/$/\//; | |
7495 } | |
7496 | |
7497 if (chdir $genome_folder){ | |
7498 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute | |
7499 unless ($absolute_genome_folder =~/\/$/){ | |
7500 $absolute_genome_folder =~ s/$/\//; | |
7501 } | |
7502 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; | |
7503 $genome_folder = $absolute_genome_folder; | |
7504 } | |
7505 else{ | |
7506 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n"; | |
7507 } | |
7508 | |
7509 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; | |
7510 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; | |
7511 | |
7512 my $bt2_small_index_present = 1; | |
7513 my $bt2_large_index_present = 1; | |
7514 | |
7515 if ($bowtie2){ ### Bowtie 2 | |
7516 | |
7517 ### Checking for small indixes first (ending in .bt2) | |
7518 | |
7519 # checking the integrity of $CT_dir | |
7520 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7521 | |
7522 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); | |
7523 foreach my $file(@CT_bowtie_index){ | |
7524 unless (-f $file){ | |
7525 warn "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; | |
7526 $bt2_small_index_present = 0; | |
7527 } | |
7528 } | |
7529 # checking the integrity of $GA_dir | |
7530 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7531 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); | |
7532 | |
7533 foreach my $file(@GA_bowtie_index){ | |
7534 unless (-f $file){ | |
7535 warn "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; | |
7536 $bt2_small_index_present = 0; | |
7537 } | |
7538 } | |
7539 | |
7540 ### Using the small index preferentially | |
7541 if ($bt2_small_index_present){ | |
7542 $bt2_large_index_present = 0; | |
7543 } | |
7544 else{ # only checking for large indexes if the 'normal' one can't be found | |
7545 warn "\nCouldn't find a traditional small Bowtie 2 index for the genome specified (ending in .bt2). Now searching for a large index instead (64-bit index ending in .bt2l)...\n"; | |
7546 | |
7547 ### If no small small indexes were found we look for large indexes (64-bit indexes, ending in .bt2l) | |
7548 | |
7549 # checking the integrity of $CT_dir | |
7550 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7551 | |
7552 @CT_bowtie_index = ('BS_CT.1.bt2l','BS_CT.2.bt2l','BS_CT.3.bt2l','BS_CT.4.bt2l','BS_CT.rev.1.bt2l','BS_CT.rev.2.bt2l'); | |
7553 foreach my $file(@CT_bowtie_index){ | |
7554 unless (-f $file){ | |
7555 die "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; | |
7556 $bt2_large_index_present = 0; } | |
7557 } | |
7558 | |
7559 ### checking the integrity of $GA_dir | |
7560 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7561 @GA_bowtie_index = ('BS_GA.1.bt2l','BS_GA.2.bt2l','BS_GA.3.bt2l','BS_GA.4.bt2l','BS_GA.rev.1.bt2l','BS_GA.rev.2.bt2l'); | |
7562 | |
7563 foreach my $file(@GA_bowtie_index){ | |
7564 unless (-f $file){ | |
7565 die "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; | |
7566 $bt2_large_index_present = 0; | |
7567 } | |
7568 } | |
7569 | |
7570 if ($bt2_large_index_present){ | |
7571 warn "64-bit large genome Bowtie 2 index found...\n"; | |
7572 } | |
7573 else{ | |
7574 die "Failed to detect either a standard (.bt2) or 64-bit (.bt2l) Bowtie 2 index for the genome specified. Please run the bismark_genome_preparation before launching Bismark\n\n"; | |
7575 } | |
7576 } | |
7577 | |
7578 } | |
7579 | |
7580 else{ ### Bowtie 1 (default) | |
7581 ### checking the integrity of $CT_dir | |
7582 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7583 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); | |
7584 foreach my $file(@CT_bowtie_index){ | |
7585 unless (-f $file){ | |
7586 die "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; | |
7587 } | |
7588 } | |
7589 ### checking the integrity of $GA_dir | |
7590 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7591 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); | |
7592 foreach my $file(@GA_bowtie_index){ | |
7593 unless (-f $file){ | |
7594 die "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation before running Bismark.\n"; | |
7595 } | |
7596 } | |
7597 } | |
7598 | |
7599 my $CT_index_basename = "${CT_dir}BS_CT"; | |
7600 my $GA_index_basename = "${GA_dir}BS_GA"; | |
7601 | |
7602 ### INPUT OPTIONS | |
7603 | |
7604 ### SEQUENCE FILE FORMAT | |
7605 ### exits if both fastA and FastQ were specified | |
7606 if ($fasta and $fastq){ | |
7607 die "Only one sequence filetype can be specified (fastA or fastQ)\n"; | |
7608 } | |
7609 | |
7610 ### unless fastA is specified explicitely, fastQ sequence format is expected by default | |
7611 if ($fasta){ | |
7612 print "FastA format specified\n"; | |
7613 $sequence_format = 'FASTA'; | |
7614 push @bowtie_options, '-f'; | |
7615 } | |
7616 elsif ($fastq){ | |
7617 print "FastQ format specified\n"; | |
7618 $sequence_format = 'FASTQ'; | |
7619 push @bowtie_options, '-q'; | |
7620 } | |
7621 else{ | |
7622 $fastq = 1; | |
7623 print "FastQ format assumed (by default)\n"; | |
7624 $sequence_format = 'FASTQ'; | |
7625 push @bowtie_options, '-q'; | |
7626 } | |
7627 | |
7628 ### SKIP | |
7629 if ($skip){ | |
7630 warn "Skipping the first $skip reads from the input file\n"; | |
7631 # push @bowtie_options,"-s $skip"; | |
7632 } | |
7633 | |
7634 ### UPTO | |
7635 if ($qupto){ | |
7636 warn "Processing sequences up to read no. $qupto from the input file\n"; | |
7637 if ($bowtie2){ | |
7638 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 | |
7639 } | |
7640 else{ | |
7641 # push @bowtie_options,"--qupto $qupto"; | |
7642 } | |
7643 } | |
7644 | |
7645 ### QUALITY VALUES | |
7646 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ | |
7647 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; | |
7648 } | |
7649 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. | |
7650 # Phred quality values work only when -q is specified | |
7651 unless ($fastq){ | |
7652 die "Phred quality values works only when -q (FASTQ) is specified\n"; | |
7653 } | |
7654 if ($bowtie2){ | |
7655 push @bowtie_options,"--phred33"; | |
7656 } | |
7657 else{ | |
7658 push @bowtie_options,"--phred33-quals"; | |
7659 } | |
7660 } | |
7661 if ($phred64){ | |
7662 # Phred quality values work only when -q is specified | |
7663 unless ($fastq){ | |
7664 die "Phred quality values work only when -q (FASTQ) is specified\n"; | |
7665 } | |
7666 if ($bowtie2){ | |
7667 push @bowtie_options,"--phred64"; | |
7668 } | |
7669 else{ | |
7670 push @bowtie_options,"--phred64-quals"; | |
7671 } | |
7672 } | |
7673 else{ | |
7674 $phred64 = 0; | |
7675 } | |
7676 | |
7677 if ($solexa){ | |
7678 if ($bowtie2){ | |
7679 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; | |
7680 } | |
7681 # Solexa to Phred value conversion works only when -q is specified | |
7682 unless ($fastq){ | |
7683 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; | |
7684 } | |
7685 push @bowtie_options,"--solexa-quals"; | |
7686 } | |
7687 else{ | |
7688 $solexa = 0; | |
7689 } | |
7690 | |
7691 ### ALIGNMENT OPTIONS | |
7692 | |
7693 ### MISMATCHES | |
7694 if (defined $mismatches){ | |
7695 if ($bowtie2){ | |
7696 if ($mismatches == 0 or $mismatches == 1){ | |
7697 push @bowtie_options,"-N $mismatches"; | |
7698 } | |
7699 else{ | |
7700 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n"; | |
7701 } | |
7702 } | |
7703 else{ | |
7704 if ($mismatches >= 0 and $mismatches <= 3){ | |
7705 push @bowtie_options,"-n $mismatches"; | |
7706 } | |
7707 else{ | |
7708 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n"; | |
7709 } | |
7710 } | |
7711 } | |
7712 else{ | |
7713 unless ($bowtie2){ | |
7714 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 | |
7715 } | |
7716 } | |
7717 | |
7718 ### SEED LENGTH | |
7719 if (defined $seed_length){ | |
7720 if ($bowtie2){ | |
7721 push @bowtie_options,"-L $seed_length"; | |
7722 } | |
7723 else{ | |
7724 push @bowtie_options,"-l $seed_length"; | |
7725 } | |
7726 } | |
7727 | |
7728 ### MISMATCH CEILING | |
7729 if (defined $ceiling){ | |
7730 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); | |
7731 push @bowtie_options,"-e $ceiling"; | |
7732 } | |
7733 | |
7734 | |
7735 ### BOWTIE 2 EFFORT OPTIONS | |
7736 | |
7737 ### CONSECUTIVE SEED EXTENSION FAILS | |
7738 if (defined $seed_extension_fails){ | |
7739 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
7740 push @bowtie_options,"-D $seed_extension_fails"; | |
7741 } | |
7742 | |
7743 ### RE-SEEDING REPETITIVE SEEDS | |
7744 if (defined $reseed_repetitive_seeds){ | |
7745 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
7746 push @bowtie_options,"-R $reseed_repetitive_seeds"; | |
7747 } | |
7748 | |
7749 | |
7750 ### BOWTIE 2 SCORING OPTIONS | |
7751 | |
7752 my ($score_min_intercept, $score_min_slope); | |
7753 | |
7754 if ($score_min){ | |
7755 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
7756 | |
7757 unless ($score_min =~ /^L,(.+),(.+)$/){ | |
7758 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
7759 } | |
7760 ($score_min_intercept, $score_min_slope) = ($1, $2); | |
7761 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 | |
7762 } | |
7763 else{ | |
7764 if ($bowtie2){ | |
7765 ($score_min_intercept, $score_min_slope) = (0, -0.2); | |
7766 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 | |
7767 } | |
7768 } | |
7769 | |
7770 ### BOWTIE 2 READ GAP OPTIONS | |
7771 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); | |
7772 | |
7773 if ($rdg){ | |
7774 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
7775 if ($rdg =~ /^(\d+),(\d+)$/){ | |
7776 $deletion_open = $1; | |
7777 $deletion_extend = $2; | |
7778 } | |
7779 else{ | |
7780 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
7781 } | |
7782 push @bowtie_options,"--rdg $rdg"; | |
7783 } | |
7784 else{ | |
7785 $deletion_open = 5; | |
7786 $deletion_extend = 3; | |
7787 } | |
7788 | |
7789 ### BOWTIE 2 REFERENCE GAP OPTIONS | |
7790 if ($rfg){ | |
7791 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
7792 if ($rfg =~ /^(\d+),(\d+)$/){ | |
7793 $insertion_open = $1; | |
7794 $insertion_extend = $2; | |
7795 } | |
7796 else{ | |
7797 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
7798 } | |
7799 push @bowtie_options,"--rfg $rfg"; | |
7800 } | |
7801 else{ | |
7802 $insertion_open = 5; | |
7803 $insertion_extend = 3; | |
7804 } | |
7805 | |
7806 | |
7807 ### BOWTIE 2 PARALLELIZATION OPTIONS | |
7808 if (defined $parallel){ | |
7809 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); | |
7810 } | |
7811 if ($bowtie2){ | |
7812 if ($parallel){ | |
7813 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); | |
7814 if ($parallel > 4){ | |
7815 warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2); | |
7816 } | |
7817 push @bowtie_options,"-p $parallel"; | |
7818 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. | |
7819 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; | |
7820 sleep (2); | |
7821 } | |
7822 } | |
7823 | |
7824 ### REPORTING OPTIONS | |
7825 | |
7826 if ($bowtie2){ | |
7827 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default | |
7828 | |
7829 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while | |
7830 if(defined $most_valid_alignments){ | |
7831 | |
7832 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; | |
7833 } | |
7834 } | |
7835 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 | |
7836 push @bowtie_options,'-k 2'; | |
7837 } | |
7838 | |
7839 ### --BEST | |
7840 if ($bowtie2){ | |
7841 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used | |
7842 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; | |
7843 } | |
7844 } | |
7845 else{ | |
7846 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) | |
7847 unless ($best){ | |
7848 push @bowtie_options,'--best'; | |
7849 } | |
7850 } | |
7851 | |
7852 ### VANILLA BISMARK (BOWTIE 1) OUTPUT | |
7853 if ($vanilla){ | |
7854 if ($bowtie2){ | |
7855 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; | |
7856 } | |
7857 } | |
7858 else{ | |
7859 $vanilla = 0; | |
7860 } | |
7861 | |
7862 ### PAIRED-END MAPPING | |
7863 if ($mates1){ | |
7864 my @mates1 = (split (/,/,$mates1)); | |
7865 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2); | |
7866 my @mates2 = (split(/,/,$mates2)); | |
7867 unless (scalar @mates1 == scalar @mates2){ | |
7868 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n"; | |
7869 } | |
7870 while (1){ | |
7871 my $mate1 = shift @mates1; | |
7872 my $mate2 = shift @mates2; | |
7873 last unless ($mate1 and $mate2); | |
7874 push @filenames,"$mate1,$mate2"; | |
7875 } | |
7876 if ($bowtie2){ | |
7877 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments | |
7878 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones | |
7879 } | |
7880 | |
7881 if ($old_flag){ | |
7882 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); | |
7883 sleep(3); | |
7884 } | |
7885 } | |
7886 elsif ($mates2){ | |
7887 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; | |
7888 } | |
7889 | |
7890 ### SINGLE-END MAPPING | |
7891 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified | |
7892 my $singles; | |
7893 unless ($mates1 and $mates2){ | |
7894 $singles = join (',',@ARGV); | |
7895 unless ($singles){ | |
7896 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; | |
7897 } | |
7898 $singles =~ s/\s/,/g; | |
7899 @filenames = (split(/,/,$singles)); | |
7900 warn "\nFiles to be analysed:\n"; | |
7901 warn "@filenames\n\n"; | |
7902 sleep (3); | |
7903 } | |
7904 | |
7905 ### MINIMUM INSERT SIZE (PAIRED-END ONLY) | |
7906 if (defined $minins){ | |
7907 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); | |
7908 push @bowtie_options,"--minins $minins"; | |
7909 } | |
7910 | |
7911 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) | |
7912 if (defined $maxins){ | |
7913 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); | |
7914 push @bowtie_options,"--maxins $maxins"; | |
7915 } | |
7916 else{ | |
7917 unless ($singles){ | |
7918 push @bowtie_options,'--maxins 500'; | |
7919 } | |
7920 } | |
7921 | |
7922 ### QUIET prints nothing besides alignments (suppresses warnings) | |
7923 if ($quiet){ | |
7924 push @bowtie_options,'--quiet'; | |
7925 } | |
7926 | |
7927 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments | |
7928 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option | |
7929 if (defined $chunk){ | |
7930 push @bowtie_options,"--chunkmbs $chunk"; | |
7931 } | |
7932 else{ | |
7933 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) | |
7934 } | |
7935 } | |
7936 | |
7937 | |
7938 ### SUMMARY OF ALL BOWTIE OPTIONS | |
7939 my $bowtie_options = join (' ',@bowtie_options); | |
7940 | |
7941 | |
7942 ### STRAND-SPECIFIC LIBRARIES | |
7943 my $directional; | |
7944 if ($non_directional){ | |
7945 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); | |
7946 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; | |
7947 sleep (1); | |
7948 $directional = 0; | |
7949 } | |
7950 elsif($pbat){ | |
7951 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); | |
7952 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); | |
7953 | |
7954 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; | |
7955 sleep (1); | |
7956 $directional = 0; | |
7957 } | |
7958 else{ | |
7959 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; | |
7960 sleep (1); | |
7961 $directional = 1; # default behaviour | |
7962 } | |
7963 | |
7964 ### UNMAPPED SEQUENCE OUTPUT | |
7965 $unmapped = 0 unless ($unmapped); | |
7966 | |
7967 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT | |
7968 $multi_map = 0 unless ($multi_map); | |
7969 | |
7970 | |
7971 ### OUTPUT DIRECTORY | |
7972 | |
7973 chdir $parent_dir or die "Failed to move back to current working directory\n"; | |
7974 if ($output_dir){ | |
7975 unless ($output_dir =~ /\/$/){ | |
7976 $output_dir =~ s/$/\//; | |
7977 } | |
7978 | |
7979 if (chdir $output_dir){ | |
7980 $output_dir = getcwd; # making the path absolute | |
7981 unless ($output_dir =~ /\/$/){ | |
7982 $output_dir =~ s/$/\//; | |
7983 } | |
7984 } | |
7985 else{ | |
7986 mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; | |
7987 warn "Created output directory $output_dir!\n\n"; | |
7988 chdir $output_dir or die "Failed to move to $output_dir\n"; | |
7989 $output_dir = getcwd; # making the path absolute | |
7990 unless ($output_dir =~ /\/$/){ | |
7991 $output_dir =~ s/$/\//; | |
7992 } | |
7993 } | |
7994 warn "Output will be written into the directory: $output_dir\n"; | |
7995 } | |
7996 else{ | |
7997 $output_dir = ''; | |
7998 } | |
7999 | |
8000 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files | |
8001 | |
8002 chdir $parent_dir or die "Failed to move back to current working directory\n"; | |
8003 if ($temp_dir){ | |
8004 warn "\nUsing temp directory: $temp_dir\n"; | |
8005 unless ($temp_dir =~ /\/$/){ | |
8006 $temp_dir =~ s/$/\//; | |
8007 } | |
8008 | |
8009 if (chdir $temp_dir){ | |
8010 $temp_dir = getcwd; # making the path absolute | |
8011 unless ($temp_dir =~ /\/$/){ | |
8012 $temp_dir =~ s/$/\//; | |
8013 } | |
8014 } | |
8015 else{ | |
8016 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; | |
8017 warn "Created temporary directory $temp_dir!\n\n"; | |
8018 chdir $temp_dir or die "Failed to move to $temp_dir\n"; | |
8019 $temp_dir = getcwd; # making the path absolute | |
8020 unless ($temp_dir =~ /\/$/){ | |
8021 $temp_dir =~ s/$/\//; | |
8022 } | |
8023 } | |
8024 warn "Temporary files will be written into the directory: $temp_dir\n"; | |
8025 } | |
8026 else{ | |
8027 $temp_dir = ''; | |
8028 } | |
8029 | |
8030 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE | |
8031 if ($non_bs_mm){ | |
8032 if ($vanilla){ | |
8033 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; | |
8034 } | |
8035 } | |
8036 | |
8037 ### PREFIX FOR OUTPUT FILES | |
8038 if ($prefix){ | |
8039 # removing trailing dots | |
8040 | |
8041 $prefix =~ s/\.+$//; | |
8042 | |
8043 warn "Using the following prefix for output files: $prefix\n\n"; | |
8044 sleep(1); | |
8045 } | |
8046 | |
8047 if (defined $multicore){ | |
8048 unless ($multicore > 0){ | |
8049 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; | |
8050 } | |
8051 if ($multicore > 20){ | |
8052 warn "Core usage currently set to more than 20 threads. This might fail horribly but let's see how it goes... (set value: $multicore)\n\n"; | |
8053 } | |
8054 if ($sam){ | |
8055 die "The multicore function currently requires the output to be in BAM format, so please lose either option --sam or --multi\n"; | |
8056 } | |
8057 } | |
8058 else{ | |
8059 $multicore = 1; # default. Single-thread mode | |
8060 warn "Setting parallelization to single-threaded (default)\n\n"; | |
8061 } | |
8062 | |
8063 if ($basename and $multicore > 1){ | |
8064 die "Specifying --basename in conjuction with --multicore is currently not supported (but we are aiming to fix this soon). Please lose either --basename or --multicore to proceed\n\n"; | |
8065 } | |
8066 | |
8067 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index_present,$multicore); | |
8068 } | |
8069 | |
8070 | |
8071 | |
### Prints the SAM header to the OUT filehandle: one @HD line, one @SQ line per
### reference sequence and a final @PG line describing this Bismark run.
### Takes no arguments; reads the file-level %SQ_order, %chromosomes,
### $bismark_version and $command_line.
sub generate_SAM_header{

    # @HD = header line, VN = format version, SO = sort order
    print OUT join("\t", '@HD', 'VN:1.0', 'SO:unsorted') . "\n";

    # @SQ = reference sequence, SN = sequence name, LN = sequence length.
    # The keys of %SQ_order are sorted numerically and each value is used as the
    # sequence name, so the @SQ lines come out in that stored order rather than
    # in the arbitrary order a plain hash traversal of %chromosomes would give.
    for my $rank (sort { $a <=> $b } keys %SQ_order){
        my $seq_name   = $SQ_order{$rank};
        my $seq_length = length($chromosomes{$seq_name});
        print OUT "\@SQ\tSN:$seq_name\tLN:$seq_length\n";
    }

    # @PG = program, ID = unique identifier, VN = program version, CL = command line
    print OUT join("\t", '@PG', 'ID:Bismark', "VN:$bismark_version", "CL:\"bismark $command_line\"") . "\n";

}
8092 | |
8093 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: | |
8094 ### O. Tam (2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011), A. Dei Rossi (2014) | |
8095 | |
### Writes one SAM record for an aligned single-end read to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier; also the key into $methylation_call_params
###   $actual_seq              - sequence of the read
###   $methylation_call_params - hash reference; the entry for $id supplies alignment_strand, chromosome,
###                              position, unmodified_genomic_sequence, methylation_call, read_conversion and
###                              genome_conversion, plus alignment_score, mapq, CIGAR, indels and
###                              genomic_seq_for_MD_tag (Bowtie 2) or number_of_mismatches (Bowtie 1)
###   $qual                    - quality string of the read
###
### Reads the file-level settings $bowtie2, $non_bs_mm, $insertion_open, $insertion_extend, $deletion_open
### and $deletion_extend, and calls revcomp(), hemming_dist() and make_mismatch_string() defined elsewhere
### in this file.
###
### Dies on an unexpected strand / conversion combination and, when --non_bs_mm is used with Bowtie 2, on
### CIGAR strings that cannot be parsed or that contain operations other than M, I and D.
###
### Side effect: for '-' strand alignments whose CIGAR string contains a deletion, the stored
### genomic_seq_for_MD_tag of this read is reverse-complemented in place.
sub single_end_SAM_output{

    my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
    my $strand            = $methylation_call_params->{$id}->{alignment_strand};
    my $chr               = $methylation_call_params->{$id}->{chromosome};
    my $start             = $methylation_call_params->{$id}->{position};
    my $ref_seq           = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
    my $methcall          = $methylation_call_params->{$id}->{methylation_call};
    my $read_conversion   = $methylation_call_params->{$id}->{read_conversion};
    my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};

    # Starting value for the optional XA tag: the alignment score for Bowtie 2 (converted into a
    # mismatch count further down), or the mismatch count stored for Bowtie 1
    my $number_of_mismatches;
    if ($bowtie2){
        $number_of_mismatches = $methylation_call_params->{$id}->{alignment_score};
    }
    else{
        $number_of_mismatches = $methylation_call_params->{$id}->{number_of_mismatches};
    }

    ##### FLAG
    ### Bits of the SAM FLAG field, taken from "The SAM Format Specification (v1.4-r985), September 7, 2011":
    ##   0x1   template has multiple segments in sequencing          (   1)
    ##   0x2   each segment properly aligned according to the aligner (   2)
    ##   0x4   segment unmapped
    ##   0x8   next segment in the template unmapped
    ##   0x10  SEQ being reverse complemented                        (  16)
    ##   0x20  SEQ of the next segment in the template being reversed (  32)
    ##   0x40  the first segment in the template                     (  64)
    ##   0x80  the last segment in the template                      ( 128)
    ##   0x100 secondary alignment
    ##   0x200 not passing quality controls
    ##   0x400 PCR or optical duplicate
    ### Only 0 and 16 are used for single-end reads. The flag follows the bisulfite strand the read is
    ### informative for, which is not necessarily the strand reported by the aligner.

    my $flag;
    if ($strand eq "+"){
        if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
            $flag = 0;    # 0 for "+" strand (OT)
        }
        elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
            $flag = 16;   # 16 for "-" strand (CTOB, yields information for the original bottom strand)
        }
        else{
            die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
        }
    }
    elsif ($strand eq "-"){
        if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
            $flag = 16;   # 16 for "-" strand (OB)
        }
        elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
            $flag = 0;    # 0 for "+" strand (CTOT, yields information for the original top strand)
        }
        else{
            die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
        }
    }
    else{
        die "Unexpected strand information: $strand\n\n";
    }

    ##### MAPQ

    my $mapq;
    if ($bowtie2){
        $mapq = $methylation_call_params->{$id}->{mapq};
    }
    else{
        $mapq = 255;   # Mapping quality is unavailable for use with Bowtie
    }

    ##### CIGAR

    my $cigar;
    if ($bowtie2){
        $cigar = $methylation_call_params->{$id}->{CIGAR};   # Actual CIGAR string reported by Bowtie 2
    }
    else{
        $cigar = length($actual_seq) . "M";                  # Bowtie 1 output does not contain indels (only matches and mismatches)
    }

    ##### RNEXT, PNEXT and TLEN only carry information for paired-end reads

    my $rnext = "*";
    my $pnext = 0;
    my $tlen  = 0;

    ##### The stored genomic sequence is 2 bp longer than the alignment; trim the extra bases

    if ($read_conversion eq 'CT'){
        $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2);   # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
    }
    else{
        $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2);   # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
    }

    if ($strand eq '-'){
        $actual_seq = revcomp($actual_seq);   # Sequence represented on the forward genomic strand
        $ref_seq    = revcomp($ref_seq);      # Required for comparison with actual sequence
        if ($cigar =~ /D/){
            # NOTE: this overwrites the value stored in the caller's hash
            $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag} );
        }
        $qual = scalar reverse $qual;         # if the sequence was reverse-complemented the quality string needs to be reversed as well
    }

    ##### NM tag: edit distance to the reference

    # hemming_dist() is defined elsewhere in this file; its result is used here as the number of
    # one-nucleotide differences between read and reference
    my $hemming_dist = hemming_dist($actual_seq,$ref_seq);
    if ($bowtie2){
        $hemming_dist += $methylation_call_params->{$id}->{indels};   # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
    }

    my $NM_tag = "NM:i:$hemming_dist";

    ##### MD tag: string providing mismatched reference bases in the alignment (this does include indel information)

    my $MD_tag = make_mismatch_string($actual_seq, $ref_seq,$cigar,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag});

    ##### XM tag: methylation call string

    my $XM_tag;
    if ($strand eq '+'){
        $XM_tag = "XM:Z:$methcall";
    }
    elsif ($strand eq '-'){
        $XM_tag = 'XM:Z:' . scalar reverse($methcall);   # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
    }

    ##### XR tag: read conversion, XG tag: genome conversion

    my $XR_tag = "XR:Z:$read_conversion";
    my $XG_tag = "XG:Z:$genome_conversion";

    ##### Optionally calculating number of mismatches for Bowtie 2 alignments

    if ($non_bs_mm) {
        if ($bowtie2) {

            $number_of_mismatches =~ s/-//;   # removing the minus sign

            ### if Bowtie 2 was used we need to analyse the CIGAR string whether the read contained any indels to determine the number of mismatches
            if ($cigar =~ /(D|I)/) {

                # parsing CIGAR string
                my @len = split (/\D+/,$cigar);   # storing the length per operation
                my @ops = split (/\d+/,$cigar);   # storing the operation
                shift @ops;                       # remove the empty first element
                die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

                foreach my $i (0..$#len) {
                    if ($ops[$i] eq 'M') {
                        next;   # matches/mismatches carry no gap penalty
                    }
                    elsif ($ops[$i] eq 'I') {   # insertion in the read sequence
                        $number_of_mismatches -= $insertion_open;
                        $number_of_mismatches -= $len[$i] * $insertion_extend;
                    }
                    elsif ($ops[$i] eq 'D') {   # deletion in the read sequence
                        $number_of_mismatches -= $deletion_open;
                        $number_of_mismatches -= $len[$i] * $deletion_extend;
                    }
                    # tr/// takes a plain character list (no [] brackets), and it is the current
                    # operation that has to be classified, not the CIGAR string as a whole
                    elsif ($ops[$i] =~ tr/NSHPX=//) {   # if these (for standard mapping) illegal characters exist we die
                        die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
                    }
                    else {
                        die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
                    }
                }
            }
            ### Now we have InDel corrected alignment scores

            ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the
            ### sequence contained more than 5 Ns, but this should occur close to never

            my $seq_N_count = $number_of_mismatches % 6;   # modulo 6 will return the integer rest after the division
            $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
        }
    }

    ##### XA tag: number of non-bisulfite mismatches (only printed with --non_bs_mm)

    my $XA_tag = "XA:Z:$number_of_mismatches";

    ##### Output

    # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
    my @sam_fields = ($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$MD_tag,$XM_tag,$XR_tag,$XG_tag);
    push @sam_fields, $XA_tag if ($non_bs_mm);

    print OUT join("\t",@sam_fields),"\n";
}
8316 | |
8317 sub paired_end_SAM_output{ | |
8318 | |
8319 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; | |
8320 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand | |
8321 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; | |
8322 my $chr = $methylation_call_params->{$id}->{chromosome}; | |
8323 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; | |
8324 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; | |
8325 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; | |
8326 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; | |
8327 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; | |
8328 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; | |
8329 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; | |
8330 | |
8331 my $id_1; | |
8332 my $id_2; | |
8333 | |
8334 if ($old_flag){ | |
8335 $id_1 = $id.'/1'; | |
8336 $id_2 = $id.'/2'; | |
8337 } | |
8338 else{ | |
8339 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard | |
8340 $id_2 = $id; | |
8341 } | |
8342 | |
8343 # Allows all degenerate nucleotide sequences in reference genome | |
8344 # die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHVX]/i; # X are padded nucleotides in case of insertions in the read | |
8345 # die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHVX]/i; | |
8346 | |
8347 my $index; # used to store the srand origin of the alignment in a less convoluted way | |
8348 | |
8349 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ | |
8350 $index = 0; ## this is OT (original top strand) | |
8351 } | |
8352 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ | |
8353 $index = 1; ## this is CTOB (complementary to OB) | |
8354 } | |
8355 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ | |
8356 $index = 2; ## this is CTOT (complementary to OT) | |
8357 } | |
8358 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ | |
8359 $index = 3; ## this is OB (original bottom) | |
8360 } | |
8361 else { | |
8362 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; | |
8363 } | |
8364 | |
8365 my $number_of_mismatches_1; | |
8366 my $number_of_mismatches_2; | |
8367 | |
8368 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine | |
8369 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! | |
8370 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; | |
8371 } | |
8372 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation | |
8373 if ($index == 2 or $index == 3){ # CTOT or OB | |
8374 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! | |
8375 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; | |
8376 } | |
8377 else{ # if the first read aligned in forward direction it is like for Bowtie 2 | |
8378 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! | |
8379 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; | |
8380 } | |
8381 } | |
8382 | |
8383 | |
8384 | |
8385 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the | |
8386 ### first or last position. | |
8387 | |
8388 if ($index == 0 or $index == 3){ # OT or OB | |
8389 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); | |
8390 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); | |
8391 } | |
8392 else{ # CTOT or CTOB | |
8393 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); | |
8394 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); | |
8395 } | |
8396 | |
8397 ##### | |
8398 | |
8399 my $start_read_1; | |
8400 my $start_read_2; | |
8401 # adjusting end positions | |
8402 | |
8403 if ($bowtie2){ | |
8404 $start_read_1 = $methylation_call_params->{$id}->{position_1}; | |
8405 $start_read_2 = $methylation_call_params->{$id}->{position_2}; | |
8406 } | |
8407 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 | |
8408 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand | |
8409 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; | |
8410 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; | |
8411 } | |
8412 else{ # read 1 is on the - strand | |
8413 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; | |
8414 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; | |
8415 } | |
8416 } | |
8417 | |
8418 ##### | |
8419 | |
8420 my $end_read_1; | |
8421 my $end_read_2; | |
8422 # adjusting end positions | |
8423 | |
8424 if ($bowtie2){ | |
8425 $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; | |
8426 $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; | |
8427 } | |
8428 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 | |
8429 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand | |
8430 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; | |
8431 $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; | |
8432 } | |
8433 else{ | |
8434 $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; | |
8435 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; | |
8436 } | |
8437 } | |
8438 | |
8439 ##### | |
8440 | |
8441 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" | |
8442 ## FLAG: bitwise FLAG. Each bit is explained in the following table: | |
8443 ## Bit Description Comment Value | |
8444 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) | |
8445 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) | |
8446 ## 0x4 segment unmapped --- --- | |
8447 ## 0x8 next segment in the template unmapped --- --- | |
8448 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) | |
8449 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) | |
8450 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) | |
8451 ## 0x80 the last segment in the template read 2 value: 2^^7 (128) | |
8452 ## 0x100 secondary alignment --- --- | |
8453 ## 0x200 not passing quality controls --- --- | |
8454 ## 0x400 PCR or optical duplicate --- --- | |
8455 | |
8456 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account | |
8457 | |
8458 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) | |
8459 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) | |
8460 | |
8461 my $flag_1; # FLAG variable used for SAM format | |
8462 my $flag_2; | |
8463 | |
8464 ### The new default FLAG values have been suggested by Peter Hickey, Australia | |
8465 | |
8466 if ($index == 0){ # OT | |
8467 unless ($old_flag){ | |
8468 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) | |
8469 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) | |
8470 } | |
8471 else{ | |
8472 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) | |
8473 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) | |
8474 } | |
8475 } | |
8476 elsif ($index == 1){ # CTOB | |
8477 unless($old_flag){ | |
8478 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) | |
8479 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) | |
8480 } | |
8481 else{ | |
8482 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) | |
8483 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) | |
8484 } | |
8485 } | |
8486 elsif ($index == 2){ # CTOT | |
8487 unless ($old_flag){ | |
8488 $flag_1 = 99; # PH: Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) | |
8489 $flag_2 = 147; # PH: Read 2 is on - strand but informative for the OT (1+2+16+128) | |
8490 } | |
8491 else{ | |
8492 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) | |
8493 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) | |
8494 } | |
8495 } | |
8496 elsif ($index == 3){ # OB | |
8497 unless ($old_flag){ | |
8498 $flag_1 = 83; # PH: Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) | |
8499 $flag_2 = 163; # PH: read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) | |
8500 } | |
8501 else{ | |
8502 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) | |
8503 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) | |
8504 } | |
8505 } | |
8506 | |
8507 ##### | |
8508 | |
8509 my $mapq; | |
8510 | |
8511 if ($bowtie2){ | |
8512 $mapq = $methylation_call_params->{$id}->{mapq}; | |
8513 } | |
8514 else{ | |
8515 $mapq = 255; # Mapping quality is unavailable for use with Bowtie | |
8516 } | |
8517 | |
8518 ##### | |
8519 | |
8520 my $cigar_1; | |
8521 my $cigar_2; | |
8522 | |
8523 if ($bowtie2){ | |
8524 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 | |
8525 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; | |
8526 } | |
8527 else{ | |
8528 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) | |
8529 $cigar_2 = length($actual_seq_2) . "M"; | |
8530 } | |
8531 | |
8532 ##### | |
8533 | |
8534 my $rnext = '='; # Chromosome of mate; applies to both reads | |
8535 | |
8536 ##### | |
8537 | |
8538 my $pnext_1 = $start_read_2; # Leftmost position of mate | |
8539 my $pnext_2 = $start_read_1; | |
8540 | |
8541 ##### | |
8542 | |
8543 my $tlen_1; # signed observed Template LENgth (or inferred fragment size) | |
8544 my $tlen_2; | |
8545 | |
8546 if ($bowtie2){ | |
8547 | |
8548 if ($start_read_1 <= $start_read_2){ | |
8549 | |
8550 # Read 1 alignment is leftmost | |
8551 | |
8552 if ($end_read_2 >= $end_read_1){ | |
8553 | |
8554 # -------------------------> read 1 reads overlapping | |
8555 # <------------------------- read 2 | |
8556 # | |
8557 # or | |
8558 # | |
8559 # -------------------------> read 1 | |
8560 # <----------------------- read 2 read 2 contained within read 1 | |
8561 # | |
8562 # or | |
8563 # | |
8564 # -------------------------> read 1 reads 1 and 2 exactly overlapping | |
8565 # <------------------------- read 2 | |
8566 # | |
8567 | |
8568 # dovetailing of reads is not enabled for Bowtie 2 alignments | |
8569 | |
8570 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, | |
8571 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign | |
8572 } | |
8573 elsif ($end_read_2 < $end_read_1){ | |
8574 | |
8575 # -------------------------> read 1 | |
8576 # <----------- read 2 read 2 contained within read 1 | |
8577 # | |
8578 # or | |
8579 # | |
8580 # -------------------------> read 1 | |
8581 # <------------------------ read 2 read 2 contained within read 1 | |
8582 | |
8583 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable | |
8584 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, | |
8585 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this | |
8586 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 | |
8587 } | |
8588 | |
8589 } | |
8590 | |
8591 elsif ($start_read_2 < $start_read_1){ | |
8592 | |
8593 if ($end_read_1 >= $end_read_2){ | |
8594 | |
8595 # Read 2 alignment is leftmost | |
8596 | |
8597 # -------------------------> read 2 reads overlapping | |
8598 # <------------------------- read 1 | |
8599 # | |
8600 # or | |
8601 # | |
8602 # -------------------------> read 2 | |
8603 # <----------------------- read 1 read 1 contained within read 2 | |
8604 # | |
8605 # | |
8606 | |
8607 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, | |
8608 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign | |
8609 } | |
8610 elsif ($end_read_1 < $end_read_2){ | |
8611 | |
8612 # -------------------------> read 2 | |
8613 # <----------- read 1 read 1 contained within read 2 | |
8614 # | |
8615 # or | |
8616 # | |
8617 # -------------------------> read 2 | |
8618 # <------------------------ read 1 read 1 contained within read 2 | |
8619 | |
8620 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable | |
8621 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, | |
8622 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this | |
8623 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 | |
8624 } | |
8625 } | |
8626 } | |
8627 | |
8628 else{ # Bowtie 1 | |
8629 | |
8630 if ($end_read_2 >= $end_read_1){ | |
8631 # Read 1 alignment is leftmost | |
8632 # -------------------------> read 1 | |
8633 # <------------------------- read 2 | |
8634 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing | |
8635 | |
8636 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, | |
8637 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign | |
8638 } | |
8639 else{ | |
8640 # Read 2 alignment is leftmost | |
8641 # -------------------------> read 2 | |
8642 # <------------------------- read 1 | |
8643 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing | |
8644 | |
8645 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, | |
8646 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign | |
8647 } | |
8648 } | |
8649 | |
8650 ##### | |
8651 | |
8652 # adjusting the strand of the sequence before we use them to generate mismatch strings | |
8653 if ($strand_1 eq '-'){ | |
8654 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand | |
8655 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence | |
8656 if ($cigar_1 =~ /D/){ | |
8657 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} ); | |
8658 } | |
8659 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well | |
8660 } | |
8661 if ($strand_2 eq '-'){ | |
8662 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand | |
8663 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence | |
8664 if ($cigar_2 =~ /D/){ | |
8665 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} ); | |
8666 } | |
8667 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well | |
8668 } | |
8669 | |
8670 # print "$actual_seq_1\n$ref_seq_1\n\n"; | |
8671 # print "$actual_seq_2\n$ref_seq_2\n\n"; | |
8672 | |
8673 ##### | |
8674 | |
8675 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence | |
8676 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); | |
8677 if ($bowtie2){ | |
8678 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence | |
8679 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence | |
8680 } | |
8681 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences | |
8682 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences | |
8683 | |
8684 ##### | |
8685 | |
8686 my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information) | |
8687 my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2}); | |
8688 | |
8689 # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) | |
8690 # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); | |
8691 | |
8692 ##### | |
8693 | |
8694 my $XM_tag_1; # Optional tag XM: Methylation call string | |
8695 my $XM_tag_2; | |
8696 | |
8697 if ($strand_1 eq '-'){ | |
8698 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented | |
8699 } | |
8700 else{ | |
8701 $XM_tag_1 = "XM:Z:$methcall_1"; | |
8702 } | |
8703 | |
8704 if ($strand_2 eq '-'){ | |
8705 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented | |
8706 } | |
8707 else{ | |
8708 $XM_tag_2 = "XM:Z:$methcall_2"; | |
8709 } | |
8710 | |
8711 ##### | |
8712 | |
8713 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state | |
8714 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state | |
8715 | |
8716 ##### | |
8717 | |
8718 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads | |
8719 | |
8720 ##### | |
8721 | |
8722 # Optionally calculating number of mismatches for Bowtie 2 alignments | |
8723 | |
8724 if ($non_bs_mm) { | |
8725 if ($bowtie2) { | |
8726 | |
8727 $number_of_mismatches_1 =~ s/-//; # removing the minus sign | |
8728 $number_of_mismatches_2 =~ s/-//; | |
8729 | |
8730 ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches | |
8731 | |
8732 ### CIGAR 1 | |
8733 if ($cigar_1 =~ /(D|I)/) { | |
8734 # warn "$cigar_1\n"; | |
8735 | |
8736 # parsing CIGAR string | |
8737 my @len = split (/\D+/,$cigar_1); # storing the length per operation | |
8738 my @ops = split (/\d+/,$cigar_1); # storing the operation | |
8739 shift @ops; # remove the empty first element | |
8740 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
8741 | |
8742 foreach (0..$#len) { | |
8743 if ($ops[$_] eq 'M') { | |
8744 # warn "skipping\n"; | |
8745 next; # irrelevant | |
8746 } | |
8747 elsif ($ops[$_] eq 'I') { # insertion in the read sequence | |
8748 $number_of_mismatches_1 -= $insertion_open; | |
8749 $number_of_mismatches_1 -= $len[$_] * $insertion_extend; | |
8750 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; | |
8751 } | |
8752 elsif ($ops[$_] eq 'D') { # deletion in the read sequence | |
8753 $number_of_mismatches_1 -= $deletion_open; | |
8754 $number_of_mismatches_1 -= $len[$_] * $deletion_extend; | |
8755 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; | |
8756 } | |
8757 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die | |
8758 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
8759 } | |
8760 else { | |
8761 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
8762 } | |
8763 } | |
8764 | |
8765 # warn "Alignment score $number_of_mismatches_1\n"; | |
8766 # print "Mismatches $number_of_mismatches_1\n\n"; | |
8767 } | |
8768 | |
8769 ### CIGAR 2 | |
8770 if ($cigar_2 =~ /(D|I)/) { | |
8771 # warn "$cigar_2\n"; | |
8772 | |
8773 # parsing CIGAR string | |
8774 my @len = split (/\D+/,$cigar_2); # storing the length per operation | |
8775 my @ops = split (/\d+/,$cigar_2); # storing the operation | |
8776 shift @ops; # remove the empty first element | |
8777 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
8778 | |
8779 foreach (0..$#len) { | |
8780 if ($ops[$_] eq 'M') { | |
8781 # warn "skipping\n"; | |
8782 next; #irrelevant | |
8783 } | |
8784 elsif ($ops[$_] eq 'I') { # insertion in the read sequence | |
8785 $number_of_mismatches_2 -= $insertion_open; | |
8786 $number_of_mismatches_2 -= $len[$_] * $insertion_extend; | |
8787 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; | |
8788 } | |
8789 elsif ($ops[$_] eq 'D') { # deletion in the read sequence | |
8790 $number_of_mismatches_2 -= $deletion_open; | |
8791 $number_of_mismatches_2 -= $len[$_] * $deletion_extend; | |
8792 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; | |
8793 } | |
8794 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die | |
8795 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
8796 } | |
8797 else { | |
8798 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
8799 } | |
8800 } | |
8801 } | |
8802 | |
8803 ### Now we have InDel corrected Alignment scores | |
8804 | |
8805 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the | |
8806 ### sequence contained more than 5 Ns, but this should occur close to never | |
8807 | |
8808 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division | |
8809 my $seq_2_N_count = $number_of_mismatches_2 % 6; | |
8810 # warn "N count 1: $seq_1_N_count\n"; | |
8811 # warn "N count 2: $seq_2_N_count\n"; | |
8812 | |
8813 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; | |
8814 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; | |
8815 | |
8816 # warn "MM1 $number_of_mismatches_1 \n"; | |
8817 # warn "MM2 $number_of_mismatches_2 \n"; | |
8818 } | |
8819 } | |
8820 | |
8821 #### | |
8822 | |
8823 my $XA_tag = "XA:Z:$number_of_mismatches_1"; | |
8824 my $XB_tag = "XB:Z:$number_of_mismatches_2"; | |
8825 | |
8826 | |
8827 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields | |
8828 ### optionally print number of non-bisulfite mismatches | |
8829 if ($non_bs_mm){ | |
8830 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; | |
8831 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; | |
8832 } | |
8833 else{ # default | |
8834 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; | |
8835 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; | |
8836 } | |
8837 } | |
8838 | |
### Returns the reverse complement of a DNA sequence.
### Upper- and lower-case A/C/G/T are complemented to upper-case bases; every other
### character (e.g. N) is kept unchanged. Dies if no sequence (or a false value such as
### an empty string) is passed in.
sub revcomp{
  my ($sequence) = @_;
  die "Missing seq to reverse complement\n" unless $sequence;

  # reverse first, then complement the copy in place
  (my $reverse_complement = scalar reverse $sequence) =~ tr/ACTGactg/TGACTGAC/;
  return $reverse_complement;
}
8845 | |
### Returns the number of positions of the read sequence (first argument) that do not
### carry the same character as the reference sequence (second argument).
###
### The comparison is position-by-position over the length of the read. Read positions
### that have no counterpart in the reference (reference shorter than the read) count
### as mismatches; surplus reference bases are ignored. An undefined argument is treated
### as an empty sequence.
sub hemming_dist{
  my ($actual_seq,$ref_seq) = @_;
  $actual_seq = '' unless defined $actual_seq;
  $ref_seq    = '' unless defined $ref_seq;

  my $actual_length = length($actual_seq);
  my $ref_length    = length($ref_seq);

  # only compare positions that exist in both strings so that we never read past the end of the reference
  my $comparable = ($actual_length < $ref_length) ? $actual_length : $ref_length;

  my $matches = 0;
  foreach my $pos (0..$comparable - 1){
    ++$matches if (substr($actual_seq,$pos,1) eq substr($ref_seq,$pos,1));
  }

  return $actual_length - $matches;
}
8856 | |
8857 | |
8858 ### The bitwise comparison was dropped: even though the initial comparison is nice and quick, the regex loop looking for non-null byte characters isn't. We might | |
8859 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: tags that also take proper care of InDels | |
8860 ### 05 June 2014 | |
8861 | |
8862 | |
### Builds the optional SAM tag 'MD:Z:' (mismatching reference bases, including deletions) for one read.
###
### Arguments:
###   $actual_seq  - the read sequence
###   $ref_seq     - the reference sequence, compared with $actual_seq position by position. A reference
###                  base 'X' marks a position that was padded because of an insertion in the read; such
###                  positions are left out of the tag
###   $cigar       - CIGAR string of the alignment. If it contains a deletion ('D') the tag is rebuilt so
###                  that every deletion appears as '^' followed by the deleted bases
###   $md_sequence - sequence the deleted bases are extracted from (via substr at the running position
###                  $del_pos, which advances over M, I and D lengths). Only read if $cigar contains a 'D'
###
### Returns the complete tag including the leading 'MD:Z:'.
###
### Dies (only when $cigar contains a 'D') if the CIGAR string has a non-matching number of lengths and
### operations, if it contains operations other than M, I or D, or if the MD string built in the first
### pass cannot be reconciled with the deletion positions.
###
### NOTE: $verbose is not declared in this subroutine; it is expected to be a debugging flag declared
### elsewhere in the file. When it is true, progress messages are written via warn.
sub make_mismatch_string{
  my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_;

  my $MD_tag = "MD:Z:";
  my $prev_matching = 0;    # number of matching bases seen since the last mismatch

  ### First pass: substitutions only (deletions are added further down)
  foreach my $pos ( 0..(length$actual_seq) - 1 ){

    my $actual_base = substr($actual_seq,$pos,1);
    my $ref_base    = substr($ref_seq,$pos,1);

    if ( $actual_base eq $ref_base ){
      ++$prev_matching;
    }
    elsif ($ref_base eq 'X'){
      # The genome base was artificially padded due to an insertion in the read at this position. Just ignoring it for the MD tag
    }
    else{
      ### There is a mismatch between the sequence and the genome. We write out how many bases matched until now (this
      ### is a padding 0 at the very start or next to another mismatch) followed by the mismatching genomic base
      $MD_tag .= $prev_matching . $ref_base;
      $prev_matching = 0; # resetting $prev_matching
    }
  }
  ### appending the number of matches one last time
  $MD_tag .= $prev_matching;


  ### If the read contains deletion(s) we need to take care of these in the MD-tag as well
  if ($cigar =~ /D/){
    my $deletions_total = ($cigar =~ tr/D//);   # number of deletion operations in the CIGAR string
    if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";}

    if ($verbose){ warn "There was a deletion in the read!\n";}
    if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";}

    # parsing CIGAR string
    my @len = split (/\D+/,$cigar); # storing the length per operation
    my @ops = split (/\d+/,$cigar); # storing the operation
    shift @ops;                     # remove the empty first element
    die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

    my $MD_pos_so_far = 0;          # position reached while walking through the MD string
    my $deletions_processed = 0;
    my $del_pos = 0;                # position reached while walking through the CIGAR string
    my $deleted_bases = '';
    my ($new_MD) = $MD_tag =~ /MD:Z:(.*)/;   # the MD string of the first pass without its 'MD:Z:' prefix
    my $md_index_already_processed; # elements of @md up to this index were handled by an earlier deletion and are copied unchanged

    my @md = split //,$new_MD;

    if ($verbose){ warn "New MD-tag: $new_MD\n\n";}
    $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag
    $new_MD = '';      # using this to build up a new string that will replace the old \@md

    if ($verbose){ warn "CIGAR string; $cigar\n";}
    ### determining end position of a read
    foreach my $index(0..$#len){

      if ($ops[$index] eq 'M'){ # matching bases
        $del_pos += $len[$index];
        if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";}
      }
      elsif($ops[$index] eq 'I'){ # insertion
        $del_pos += $len[$index];
        ### need to add insertions in the read to MD pos so far!
        $MD_pos_so_far += $len[$index];
        if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";}
      }
      elsif($ops[$index] eq 'D'){ # deletion
        if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";}
        $deleted_bases = substr($md_sequence,$del_pos,$len[$index]);
        if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";}

        ### Now we need to process the MD-tag so far and write out everything up until this point, including the deletion
        if ($verbose){ warn "Now processing the MD-tag\n";}
        my $op;                       # MD operation currently being assembled (a run of digits or a single mismatch base)

        my $this_deletion_processed;
        my $current_md_index;

        foreach my $el (@md){

          unless (defined $current_md_index){
            $current_md_index = 0; # first element = index 0
          }
          else{
            ++$current_md_index;
          }

          if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){
            if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";}
            $new_MD .= $el;
            next;
          }

          if ($verbose){ warn "Current element: $el\n";}
          unless (defined $op){ # initialize
            $op = $el;
            if ($verbose){ warn "Initializing \$op as $op\n";}
            next;
          }

          if ($deletions_processed == $deletions_total){
            if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";}
            $MD_tag .= $el;
            $new_MD .= $el;
            next;
          }
          # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag
          if ($this_deletion_processed){
            $new_MD .= $el;
            next;
          }

          if ($op =~ /^\d+$/){
            if ($verbose){ warn "Operation so far was a digit: $op\n";}
            if ($el =~ /\d/){
              $op .= $el;
              if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";}
              next;
            }
            else{
              if ($verbose){ warn "current element is a word character: $el\n";}

              ### Need to determine if the matching operation length includes the deletion position
              if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";}
              $MD_pos_so_far += $op;
              if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";}
              if ($MD_pos_so_far < $del_pos){
                if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";}
                $MD_tag .= $op;
                $new_MD .= $op;
                if ($verbose){ warn "Setting new operation to: $el\n";}
                $op = $el; # setting new $op
              }
              else{
                if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
                ### splitting up the number of matching bases in number before and after the deletion

                my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
                my $pos_before_deletion = $op - $pos_after_deletion;
                if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}
                $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
                $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";
                if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";}

                #adjusting the MD_position by the number of bases after the deletion
                $MD_pos_so_far -= $pos_after_deletion;
                if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }
                ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD
                $new_MD .= $el;

                $deletions_processed += 1;
                $this_deletion_processed = 1;

                if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
                  if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
                  $MD_tag .= "${pos_after_deletion}";

                  ### also appending the current element because we are writing out the rest of the MD-string unchanged
                  if ($verbose){ warn "also appending the current element $el\n";}
                  $MD_tag .= $el;
                  ### Finally also adding the length of the deletion to $del_pos
                  $del_pos += $len[$index];
                  if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                }
                else{
                  if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
                  if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";}

                  ### This might need looking at!!
                  ### The rebuilt @md will hold the deletion string in place of the operation just split up, so
                  ### the index is shifted by the difference in length between the two
                  $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op;
                  if ($verbose){ warn "Current index = $current_md_index\n";}

                  if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
                  $md_index_already_processed = $current_md_index - 1;

                  if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

                  ### Finally also adding the length of the deletion to $del_pos
                  $del_pos += $len[$index];
                  $MD_pos_so_far += $len[$index];
                  if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                  if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                  #setting $op to an empty string so it is not being processed as the last element
                  $op = '';
                }
              }
            }
            if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
          }
          else{
            if ($verbose){ warn "Operation so far was a word character: $op\n";}
            if ($el =~ /\d+/){
              # processing the previous mismatch position
              $MD_tag .= $op;
              $new_MD .= $op;
              $MD_pos_so_far += length($op);
              if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";}
            }
            else{
              # this should never occur since mismatches are followed by a 0 or another digit
              die "current element is a another word character: $el. This should never happen!\n";
            }
            if ($verbose){ warn "Setting new operation to: $el\n";}
            $op = $el; # setting new $op
            if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
          }
        }

        ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag
        if ($op =~ /\d+/ and $deletions_processed < $deletions_total){
          if ($verbose){ warn "\n\nlast operation was $op\n";}
          if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";}

          $MD_pos_so_far += $op;
          if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";}
          if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";}
          if ($MD_pos_so_far >= $del_pos){
            if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
            ### splitting up the number of matching bases in number before and after the deletion

            my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
            my $pos_before_deletion = $op - $pos_after_deletion;
            if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}

            $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
            $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";

            #adjusting the MD_position by the number of bases after the deletion
            $MD_pos_so_far -= $pos_after_deletion;
            if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }

            $deletions_processed += 1;
            $this_deletion_processed = 1;

            if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
              if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
              $MD_tag .= "${pos_after_deletion}";

            }
            else{
              if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
              if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length$op,") to current_md_index\n";}

              $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length$op;
              if ($verbose){ warn "Current index = $current_md_index\n";}

              if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
              # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (it hasn't been incremented in the first place...)
              $md_index_already_processed = $current_md_index;

              if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

              $MD_pos_so_far += $len[$index];
              if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
            }
            ### Finally also adding the length of the deletion to $del_pos
            $del_pos += $len[$index];
            if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
          }
          else{
            die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n";
          }
        }

        # forming a new @md
        @md = split //,$new_MD;
        $new_MD = '';
        if ($verbose){ warn "New \@md array: @md\n\n";}
        if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";}

      }
      else{
        # every operation other than M, I and D (this includes N) ends up here
        die "Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
      }
    }

  }
  if ($verbose){ warn "Returning MD-tag: $MD_tag\n";}
  return $MD_tag;

}
9176 | |
9177 ### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null byte characters isn't. We might | |
9178 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels | |
9179 # sub make_mismatch_string{ | |
9180 # my $actual_seq = shift or die "Missing actual sequence\n"; | |
9181 # my $ref_seq = shift or die "Missing reference sequence\n"; | |
9182 # my $XX_tag = "XX:Z:"; | |
9183 | |
9184 # my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison | |
9185 | |
9186 # warn "'$tmp'\n"; sleep(1); | |
9187 # my $prev_mm_pos = 0; | |
9188 | |
9189 # while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference | |
9190 # my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch | |
9191 # my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read | |
9192 # $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other | |
9193 # $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) | |
9194 # $prev_mm_pos = pos($tmp); # Position of last mismatch | |
9195 # } | |
9196 # my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence | |
9197 # $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence | |
9198 # return $XX_tag; | |
9199 # } | |
9200 | |
9201 | |
9202 | |
### Prints the Bismark help text (licence notice, description, usage, arguments and all
### command line options) to the currently selected output filehandle (STDOUT by default).
###
### Takes no arguments. The implicit return value is that of print().
###
### The text is kept in a non-interpolating (single-quoted) here-document: it contains a
### literal '@' and no Perl variables, so nothing in it must ever be treated as a variable.
sub print_helpfile{
  print <<'HOW_TO';


This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.



DESCRIPTION


The following is a brief description of command line options and arguments to control the Bismark
bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
sequence from the genome and determine if there were any protected C's present or not.

As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
re-enabled by using --non_directional.

The final output of Bismark is in SAM format by default. For Bowtie 1 one can also choose to report the old
'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.


USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}


ARGUMENTS:

<genome_folder> The path to the folder containing the unmodified reference genome
as well as the subfolders created by the Bismark_Genome_Preparation
script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
Bismark expects one or more fastA files in this folder (file extension: .fa
or .fasta). The path can be relative or absolute.

-1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
"_1", e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates2>.
Reads may be a mix of different lengths. Bismark will produce one mapping result
and one report file per paired-end input file pair.

-2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
"_2", e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must
correspond file-for-file and read-for-read with those specified in <mates1>.
Reads may be a mix of different lengths.

<singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
produce one mapping result and one report file per input file.


OPTIONS:


Input:

-q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ
files (usually having extension .fq or .fastq). This is the default. See also
--solexa-quals.

-f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA
files (usually having extension .fa, .mfa, .fna or similar). All quality values
are assumed to be 40 on the Phred scale. FASTA files are expected to contain both
the read name and the sequence on a single line (and not spread over several lines).

-s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.

-u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.

--phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.

--phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.

--solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
(which can't). The formula for conversion is:
phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
is usually the right option for use with (unconverted) reads emitted by the GA
Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.

--solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
reads emitted by GA Pipeline version 1.3 or later. Default: off.

--path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
specified it is assumed that Bowtie (1 or 2) is in the PATH.


Alignment:

-n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).

-l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).

-e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
quality values to the nearest 10 and saturates at 30. This value is not relevant for
Bowtie 2.

--chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
--best mode. Best-first search must keep track of many paths at once to ensure it is
always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
memory impact of the descriptors, but they can still grow very large in some cases. If
you receive an error message saying that chunk memory has been exhausted in --best mode,
try adjusting this parameter up to dedicate more memory to the descriptors. This value
is not relevant for Bowtie 2. Default: 512.

-I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
satisfied). A 19-bp gap would not be valid in that case. Default: 0.

-X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
A 61-bp gap would not be valid in that case. Default: 500.

--multicore <int> Sets the number of parallel instances of Bismark to be run concurrently. This forks the
Bismark alignment step very early on so that each individual Spawn of Bismark processes
only every n-th sequence (n being set by --multicore). Once all processes have completed,
the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged
into single files in very much the same way as they would have been generated running Bismark
conventionally with only a single instance.

If system resources are plentiful this is a viable option to speed up the alignment process
(we observed a near linear speed increase for up to --multicore 8 tested). However, please note
that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of
Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner
and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified
will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for
e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB of RAM, but at the same time
reduce the alignment time to ~25-30%. You have been warned.



Bowtie 1 Reporting:

-k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
will be used by default.

--best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
(i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
have Phred quality 10. When --best is not specified, Bowtie may report alignments that
are sub-optimal in terms of stratum and/or quality (though an effort is made to report
the best alignment). --best mode also removes all strand bias. Note that --best does not
affect which alignments are considered "valid" by Bowtie, only which valid alignments
are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
Default: on.

--no_best Disables the --best option which is on by default. This can speed up the alignment process,
e.g. for testing purposes, but for credible results it is not recommended to disable --best.


Output:

--non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
bisulfite strands will be reported. Default: OFF.

(The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
to the original strands are merely theoretical and should not exist in reality. Specifying directional
alignments (which is the default) will only run 2 alignment threads to the original top (OT)
or bottom (OB) strands in parallel and report these alignments. This is the recommended option
for strand-specific libraries).

--pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al.,
PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode,
as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT
and OB ones. Use this option only if you are certain that your libraries were constructed following
a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option
--pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed
temporary files only.

--sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
split up into several smaller files to run concurrently and the output files are to be merged.

--quiet Print nothing besides alignments.

--vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
of SAM format output.

-un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
appear as they did in the input, without any translation of quality values that may have
taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.

--ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
mismatches or other reads that fail to align uniquely to a file in the output directory.
Written reads will appear as they did in the input, without any of the translation of quality
values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
_ambiguous_reads_2.txt. These reads are not written to the file specified with --un.

-o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
to create it first. The path to the output folder can be either relative or absolute.

--temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
the specified folder does not exist, Bismark will attempt to create it first. The path to the
temporary folder can be either relative or absolute.

--non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read during the
alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is
just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions.
The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches'
and 'XB:Z:number of mismatches' for read 2 of paired-end reads.

--gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk
space. This option is available for most alignment modes but is not available for paired-end FastA
files. This option might be somewhat slower than writing out uncompressed files, but this awaits
further testing.

--sam The output will be written out in SAM format instead of the default BAM format. Bismark will
attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't
been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,
the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file).

--samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified
explicitly if Samtools is in the PATH already.

--prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For
example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc.

-B/--basename <basename> Write all output to files starting with this base file name. For example, '--basename foo'
would result in the files 'foo.sam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes
precedence over --prefix.

--old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition,
this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since
both the appended read IDs and custom FLAG values may cause problems with some downstream tools
such as Picard, new defaults were implemented as of version 0.8.3.


default old_flag
=================== ===================
Read 1 Read 2 Read 1 Read 2

OT: 99 147 67 131

OB: 83 163 115 179

CTOT: 99 147 67 131

CTOB: 83 163 115 179


Other:

-h/--help Displays this help file.

-v/--version Displays version information.


BOWTIE 2 SPECIFIC OPTIONS

--bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
alignments, i.e. searches for alignments involving all read characters (also called
untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2
indexes are supported. Default: off.

Bowtie 2 alignment options:

-N <int> Sets the number of mismatches to be allowed in a seed alignment during multiseed alignment.
Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
Bowtie 1 see -n).

-L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
Bowtie 1 see -l).

--ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
position to be the highest possible, regardless of the actual value. I.e. input is treated
as though all quality values are high. This is also the default behavior when the input
doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.


Bowtie 2 paired-end options:

--no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
it cannot find a concordant or discordant alignment for a pair. This option is invariable and
on by default.

--no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
A discordant alignment is an alignment where both mates align uniquely, but that does not
satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
and it is on by default.


Bowtie 2 effort options:

-D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
the alignments found so far. A seed extension "fails" if it does not yield a new best or a
new second-best alignment. Default: 15.

-R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of
mismatches allowed) at different offsets and searches for more alignments. A read is considered
to have repetitive seeds if the total number of seed hits divided by the number of seeds
that aligned at least once is greater than 300. Default: 2.

Bowtie 2 parallelization options:


-p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
automatically use the option '--reorder', which guarantees that output SAM records are printed in
an order corresponding to the order of the reads in the original input file, even when -p is set
greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
correspond to input order in that case.

Bowtie 2 Scoring options:

--score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
"valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
L,0,-0.2.

--rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
of <int1> + N * <int2>. Default: 5, 3.

--rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
a penalty of <int1> + N * <int2>. Default: 5, 3.


Bowtie 2 Reporting options:

-most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
effort expended to find valid alignments.

For reference, this used to be the old (now deprecated) description of -M:
Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
happens first. Only the best alignment is reported. Information from the other alignments is used to
estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
guarantee that the alignment reported is the best possible in terms of alignment score. -M is
always used and its default value is set to 10.


'VANILLA' Bismark OUTPUT:

Single-end output format (tab-separated):

(1) <seq-ID>
(2) <read alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence>
(7) <equivalent genomic sequence>
(8) <methylation call>
(9) <read conversion>
(10) <genome conversion>
(11) <read quality score (Phred33)>


Paired-end output format (tab-separated):
(1) <seq-ID>
(2) <read 1 alignment strand>
(3) <chromosome>
(4) <start position>
(5) <end position>
(6) <observed bisulfite sequence 1>
(7) <equivalent genomic sequence 1>
(8) <methylation call 1>
(9) <observed bisulfite sequence 2>
(10) <equivalent genomic sequence 2>
(11) <methylation call 2>
(12) <read 1 conversion>
(13) <genome conversion>
(14) <read 1 quality score (Phred33)>
(15) <read 2 quality score (Phred33)>


Bismark SAM OUTPUT (default):

(1) QNAME (seq-ID)
(2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
(3) RNAME (chromosome)
(4) POS (start position)
(5) MAPQ (always 255 for use with Bowtie)
(6) CIGAR
(7) RNEXT
(8) PNEXT
(9) TLEN
(10) SEQ
(11) QUAL (Phred33 scale)
(12) NM-tag (edit distance to the reference)
(13) MD-tag (base-by-base mismatches to the reference (handles indels))
(14) XM-tag (methylation call string)
(15) XR-tag (read conversion state for the alignment)
(16) XG-tag (genome conversion state for the alignment)
(17) XA/XB-tag (non-bisulfite mismatches) (optional!)

Each read of paired-end alignments is written out in a separate line in the above format.


Last edited on 06 May 2015.

HOW_TO
}