Mercurial > repos > bgruening > bismark
comparison bismark_mapping/bismark @ 7:fcadce4d9a06 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit b'e6ee273f75fff61d1e419283fa8088528cf59470\n'
author | bgruening |
---|---|
date | Sat, 06 May 2017 13:18:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:0f8646f22b8d | 7:fcadce4d9a06 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use IO::Handle; | |
5 use Cwd; | |
6 $|++; | |
7 use Getopt::Long; | |
8 use FindBin qw($Bin); | |
9 use lib "$Bin/../lib"; | |
10 | |
11 ## This program is Copyright (C) 2010-16, Felix Krueger (felix.krueger@babraham.ac.uk) | |
12 | |
13 ## This program is free software: you can redistribute it and/or modify | |
14 ## it under the terms of the GNU General Public License as published by | |
15 ## the Free Software Foundation, either version 3 of the License, or | |
16 ## (at your option) any later version. | |
17 | |
18 ## This program is distributed in the hope that it will be useful, | |
19 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 ## GNU General Public License for more details. | |
22 | |
23 ## You should have received a copy of the GNU General Public License | |
24 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
25 | |
26 | |
### Remember where we were started from and how we were called. The raw command
### line is captured here, i.e. before any option name is rewritten below.
my $parent_dir      = getcwd;
my $bismark_version = 'v0.16.3';
my $command_line    = join (" ",@ARGV);


### '--solexa1.3-quals' contains a '.', which will cause Getopt::Long to fail. Any
### occurrence is therefore replaced with '--phred64-quals' before the command line
### gets processed
for my $index (0..$#ARGV){
    $ARGV[$index] = '--phred64-quals' if ($ARGV[$index] eq '--solexa1.3-quals');
}

my @filenames; # will be populated by processing the command line

### All run-time settings, in the order in which process_command_line() returns them
my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
    $non_bs_mm,
    $insertion_open,
    $insertion_extend,
    $deletion_open,
    $deletion_extend,
    $gzip,
    $bam,
    $samtools_path,
    $pbat,
    $prefix,
    $old_flag,
    $basename,
    $score_min_intercept,
    $score_min_slope,
    $bt2_large_index,
    $multicore,
    $rg_tag,
    $rg_id,
    $rg_sample,
    $ambig_bam,
    $cram,
    $cram_ref,
    $nucleotide_coverage,
    $dovetail,
) = process_command_line();

### File-level state shared by the subroutines and the main loop further down
my @fhs;                   # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes;           # stores the chromosome sequences of the reference genome
my %SQ_order;              # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes
my %counting;              # counting various events
my $final_output_filename; # required for the nucleotide coverage report
my @pids;                  # storing the process IDs of child processes in parallel mode


my $seqID_contains_tabs;
my $verbose = 0;

warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n" if ($multicore > 1);
56 | |
57 | |
### Spawns the worker processes used in parallel (multicore) mode.
###
### Reads the file-level $multicore setting and appends the PID of every child it
### forks to the file-level @pids array.
###
### Returns the list ($process_id,$offset):
###   - parent process:   $process_id is the PID of the last child forked (a true
###                       value) and $offset has been counted up to $multicore
###   - child process:    $process_id is 0 and $offset is the value the counter had
###                       when this child was forked
###   - single-core mode: (1,1), no fork takes place
###
### Dies if fork() fails.
sub multi_process_handling{

    my $offset = 1;

    unless ($multicore > 1){
        warn "Single-core mode: setting pid to 1\n";
        return (1,$offset);
    }

    my $process_id;

    # Looping on '<' instead of '==' guarantees termination (and thus a bounded number
    # of forks) even if $multicore should ever not be a whole number
    while ($offset < $multicore){

        my $fork = fork;

        # fork returns undef on failure; we cannot continue in a half-spawned state
        die "Forking unsuccessful: $!\n" unless (defined $fork);

        if ($fork == 0){
            # child process: keeps the current offset and proceeds straight to processing
            $process_id = 0;
            last;
        }

        # parent process: remember the child and move on to the next offset
        $process_id = $fork;
        push @pids, $process_id;
        ++$offset;
    }

    return ($process_id,$offset);
}
112 | |
113 | |
### Writes the share of a FastQ file that one parallel instance has to process into
### a temporary file.
###
### Arguments:
###   $filename   - input FastQ file; names ending in 'gz' are read through gunzip
###   $process_id - only used in the progress message
###   $offset     - selects the records to keep: the n-th 4-line record (counting
###                 from 1) is written if (n - $offset) % $multicore == 0
###
### Uses the file-level settings $multicore, $gzip (compress the temporary file) and
### $temp_dir (prepended to the temporary file name).
###
### Returns the name of the temporary file WITHOUT $temp_dir, i.e.
### '<basename of $filename>.temp.<$offset>' plus '.gz' if $gzip is set.
### Dies if the input or the output cannot be opened.
sub subset_input_file_FastQ{

    my ($filename,$process_id,$offset) = @_;

    my $in;
    if ($filename =~ /gz$/){
        # list-form pipe open: the file name is handed to gunzip as-is and never parsed by a shell
        open ($in,'-|','gunzip','-c',$filename) or die "Couldn't read from file '$filename': $!\n";
    }
    else{
        open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
    }

    my $temp = $filename;
    $temp .= ".temp.$offset";
    $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information

    my $out;
    if ($gzip){
        $temp .= '.gz';
        # the redirection needs a shell, so the target path is single-quoted (with embedded quotes escaped)
        (my $quoted_path = "${temp_dir}${temp}") =~ s/'/'\\''/g;
        open ($out,'|-',"gzip -c - > '$quoted_path'") or die "Can't write to file ${temp_dir}${temp}: $!\n";
    }
    else{
        open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
    }

    my $record_count = 0;

    while (1){
        my $l1 = <$in>;
        my $l2 = <$in>;
        my $l3 = <$in>;
        my $l4 = <$in>;

        # testing for definedness (not truth) so that a final quality line consisting of
        # just '0' without a trailing newline does not get the last record dropped
        last unless (defined $l4);
        ++$record_count;

        next unless ( ($record_count - $offset)%$multicore == 0 );
        print {$out} "$l1$l2$l3$l4";
    }

    # when closing a pipe fails because the child exited non-zero, $! is 0 and the status is in $?
    close $in  or warn "Failed to close input file '$filename': ".($! ? "$!" : "child exit status $?")."\n";
    close $out or warn "Failed to close file handle TEMPFQ: $!\n";

    warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

    return ($temp); # returning the subset filename

}
169 | |
### Writes the share of a FastA file that one parallel instance has to process into
### a temporary file. Records are read as pairs of lines (the code assumes one header
### line followed by one sequence line).
###
### Arguments:
###   $filename   - input FastA file; names ending in 'gz' are read through gunzip
###   $process_id - only used in the progress message
###   $offset     - selects the records to keep: the n-th 2-line record (counting
###                 from 1) is written if (n - $offset) % $multicore == 0
###
### Uses the file-level settings $multicore, $gzip (compress the temporary file) and
### $temp_dir (prepended to the temporary file name).
###
### Returns the name of the temporary file WITHOUT $temp_dir, i.e.
### '<basename of $filename>.temp.<$offset>' plus '.gz' if $gzip is set.
### Dies if the input or the output cannot be opened.
sub subset_input_file_FastA{

    my ($filename,$process_id,$offset) = @_;

    my $in;
    if ($filename =~ /gz$/){
        # list-form pipe open: the file name is handed to gunzip as-is and never parsed by a shell
        open ($in,'-|','gunzip','-c',$filename) or die "Couldn't read from file '$filename': $!\n";
    }
    else{
        open ($in,'<',$filename) or die "Couldn't read from file '$filename': $!\n";
    }

    my $temp = $filename;
    $temp .= ".temp.$offset";
    $temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information

    my $out;
    if ($gzip){
        $temp .= '.gz';
        # the redirection needs a shell, so the target path is single-quoted (with embedded quotes escaped)
        (my $quoted_path = "${temp_dir}${temp}") =~ s/'/'\\''/g;
        open ($out,'|-',"gzip -c - > '$quoted_path'") or die "Can't write to file ${temp_dir}${temp}: $!\n";
    }
    else{
        open ($out,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
    }

    warn "Writing temporary infile to $temp\n";

    my $record_count = 0;

    while (1){
        my $l1 = <$in>;
        my $l2 = <$in>;

        # testing for definedness (not truth) so that a final sequence line consisting of
        # just '0' without a trailing newline does not get the last record dropped
        last unless (defined $l2);
        ++$record_count;

        next unless ( ($record_count - $offset)%$multicore == 0 );
        print {$out} "$l1$l2";
    }

    # when closing a pipe fails because the child exited non-zero, $! is 0 and the status is in $?
    close $in  or warn "Failed to close input file '$filename': ".($! ? "$!" : "child exit status $?")."\n";
    close $out or warn "Failed to close file handle TEMPFA: $!\n";

    warn "Finished subdividing $filename for PID: $process_id and offset $offset\n\n";

    return ($temp); # returning the subset filename

}
225 | |
226 ##### | |
227 ##### | |
228 | |
229 foreach my $filename (@filenames){ | |
230 | |
231 my $original_filename = $filename; | |
232 my $original_filename_1; | |
233 my $original_filename_2; | |
234 | |
235 chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n"; | |
236 ### resetting the counting hash and fhs | |
237 reset_counters_and_fhs($filename); | |
238 @pids = (); | |
239 $seqID_contains_tabs = 0; | |
240 | |
241 ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time | |
242 unless (%chromosomes){ | |
243 my $cwd = getcwd; # storing the path of the current working directory | |
244 warn "Current working directory is: $cwd\n\n"; | |
245 read_genome_into_memory($cwd); | |
246 } | |
247 | |
248 ### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by | |
249 ### splitting the input file(s) into several smaller subfiles and merging the results back at | |
250 ### the end. | |
251 | |
252 # get general settings (also for single-threaded use) | |
253 my ($pid,$offset) = multi_process_handling (); | |
254 | |
255 my ($single_end,$paired_end); | |
256 ### PAIRED-END ALIGNMENTS | |
257 if ($filename =~ ','){ | |
258 | |
259 $single_end = 0; | |
260 $paired_end = 1; | |
261 | |
262 my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file | |
263 | |
264 $fhs[0]->{name} = 'CTread1GAread2CTgenome'; | |
265 $fhs[1]->{name} = 'GAread1CTread2GAgenome'; | |
266 $fhs[2]->{name} = 'GAread1CTread2CTgenome'; | |
267 $fhs[3]->{name} = 'CTread1GAread2GAgenome'; | |
268 warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n"; | |
269 | |
270 my ($filename_1,$filename_2) = (split (/,/,$filename)); | |
271 $original_filename_1 = $filename_1; | |
272 $original_filename_2 = $filename_2; | |
273 | |
274 warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n"; | |
275 | |
276 ### subsetting the input file(s) | |
277 unless ($multicore == 1){ # not needed in single-core mode | |
278 # warn "My PID: $pid\nMy offset: $offset\n"; | |
279 if ($sequence_file_format eq 'FASTA'){ | |
280 my $temp_filename_1 = subset_input_file_FastA($filename_1,$pid,$offset); | |
281 warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; | |
282 $filename_1 = "${temp_dir}$temp_filename_1"; | |
283 | |
284 my $temp_filename_2 = subset_input_file_FastA($filename_2,$pid,$offset); | |
285 warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; | |
286 $filename_2 = "${temp_dir}$temp_filename_2"; | |
287 } | |
288 else{ # FastQ format, default | |
289 my $temp_filename_1 = subset_input_file_FastQ($filename_1,$pid,$offset); | |
290 warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n"; | |
291 $filename_1 = "${temp_dir}$temp_filename_1"; | |
292 | |
293 my $temp_filename_2 = subset_input_file_FastQ($filename_2,$pid,$offset); | |
294 warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n"; | |
295 $filename_2 = "${temp_dir}$temp_filename_2"; | |
296 } | |
297 } | |
298 | |
299 ### additional variables only for paired-end alignments | |
300 my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file | |
301 | |
302 ### FastA format | |
303 if ($sequence_file_format eq 'FASTA'){ | |
304 warn "Input files are in FastA format\n"; | |
305 | |
306 if ($directional){ | |
307 ($C_to_T_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number | |
308 ($G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); | |
309 | |
310 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
311 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
312 $fhs[1]->{inputfile_1} = undef; | |
313 $fhs[1]->{inputfile_2} = undef; | |
314 $fhs[2]->{inputfile_1} = undef; | |
315 $fhs[2]->{inputfile_2} = undef; | |
316 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
317 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
318 } | |
319 else{ | |
320 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number | |
321 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastAFiles_paired_end ($filename_2,2); | |
322 | |
323 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
324 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
325 $fhs[1]->{inputfile_1} = $G_to_A_infile_1; | |
326 $fhs[1]->{inputfile_2} = $C_to_T_infile_2; | |
327 $fhs[2]->{inputfile_1} = $G_to_A_infile_1; | |
328 $fhs[2]->{inputfile_2} = $C_to_T_infile_2; | |
329 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
330 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
331 } | |
332 | |
333 if ($bowtie2){ | |
334 paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); | |
335 } | |
336 else{ | |
337 paired_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); | |
338 } | |
339 } | |
340 | |
341 ### FastQ format | |
342 else{ | |
343 warn "Input files are in FastQ format\n"; | |
344 if ($directional){ | |
345 if ($bowtie2){ | |
346 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number | |
347 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); | |
348 | |
349 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
350 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
351 $fhs[1]->{inputfile_1} = undef; | |
352 $fhs[1]->{inputfile_2} = undef; | |
353 $fhs[2]->{inputfile_1} = undef; | |
354 $fhs[2]->{inputfile_2} = undef; | |
355 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
356 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
357 } | |
358 else{ # Bowtie 1 alignments | |
359 if ($gzip){ | |
360 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time | |
361 | |
362 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format | |
363 $fhs[0]->{inputfile_2} = undef; # no longer needed | |
364 $fhs[1]->{inputfile_1} = undef; | |
365 $fhs[1]->{inputfile_2} = undef; | |
366 $fhs[2]->{inputfile_1} = undef; | |
367 $fhs[2]->{inputfile_2} = undef; | |
368 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; # this file contains both read 1 and read 2 in tab delimited format | |
369 $fhs[3]->{inputfile_2} = undef; # no longer needed | |
370 } | |
371 else{ | |
372 ($C_to_T_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number | |
373 ($G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); | |
374 | |
375 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
376 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
377 $fhs[1]->{inputfile_1} = undef; | |
378 $fhs[1]->{inputfile_2} = undef; | |
379 $fhs[2]->{inputfile_1} = undef; | |
380 $fhs[2]->{inputfile_2} = undef; | |
381 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
382 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
383 } | |
384 } | |
385 } | |
386 elsif($pbat){ # PBAT-Seq. This works for both Bowtie and Bowtie 2 | |
387 ### At the moment we are only performing alignments only with uncompressed FastQ files | |
388 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number | |
389 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); | |
390 | |
391 $fhs[0]->{inputfile_1} = undef; | |
392 $fhs[0]->{inputfile_2} = undef; | |
393 $fhs[1]->{inputfile_1} = $G_to_A_infile_1; | |
394 $fhs[1]->{inputfile_2} = $C_to_T_infile_2; | |
395 $fhs[2]->{inputfile_1} = $G_to_A_infile_1; | |
396 $fhs[2]->{inputfile_2} = $C_to_T_infile_2; | |
397 $fhs[3]->{inputfile_1} = undef; | |
398 $fhs[3]->{inputfile_2} = undef; | |
399 } | |
400 else{ | |
401 if ($bowtie2){ | |
402 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number | |
403 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); | |
404 | |
405 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
406 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
407 $fhs[1]->{inputfile_1} = $G_to_A_infile_1; | |
408 $fhs[1]->{inputfile_2} = $C_to_T_infile_2; | |
409 $fhs[2]->{inputfile_1} = $G_to_A_infile_1; | |
410 $fhs[2]->{inputfile_2} = $C_to_T_infile_2; | |
411 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
412 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
413 } | |
414 else{ # Bowtie 1 alignments | |
415 if ($gzip){ | |
416 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end_bowtie1_gzip ($filename_1,$filename_2); # passing both reads at the same time | |
417 | |
418 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
419 $fhs[0]->{inputfile_2} = undef; # not needed for compressed temp files | |
420 $fhs[1]->{inputfile_1} = $G_to_A_infile_1; | |
421 $fhs[1]->{inputfile_2} = undef; | |
422 $fhs[2]->{inputfile_1} = $G_to_A_infile_1; | |
423 $fhs[2]->{inputfile_2} = undef; | |
424 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
425 $fhs[3]->{inputfile_2} = undef; # not needed for compressed temp files | |
426 } | |
427 else{ # uncompressed temp files | |
428 ($C_to_T_infile_1,$G_to_A_infile_1) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number | |
429 ($C_to_T_infile_2,$G_to_A_infile_2) = biTransformFastQFiles_paired_end ($filename_2,2); | |
430 | |
431 $fhs[0]->{inputfile_1} = $C_to_T_infile_1; | |
432 $fhs[0]->{inputfile_2} = $G_to_A_infile_2; | |
433 $fhs[1]->{inputfile_1} = $G_to_A_infile_1; | |
434 $fhs[1]->{inputfile_2} = $C_to_T_infile_2; | |
435 $fhs[2]->{inputfile_1} = $G_to_A_infile_1; | |
436 $fhs[2]->{inputfile_2} = $C_to_T_infile_2; | |
437 $fhs[3]->{inputfile_1} = $C_to_T_infile_1; | |
438 $fhs[3]->{inputfile_2} = $G_to_A_infile_2; | |
439 } | |
440 } | |
441 } | |
442 if ($bowtie2){ | |
443 paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); | |
444 } | |
445 else{ | |
446 paired_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2); | |
447 } | |
448 } | |
449 start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid); | |
450 } | |
451 | |
452 ### Else we are performing SINGLE-END ALIGNMENTS | |
453 else{ | |
454 warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n"; | |
455 | |
456 $single_end = 1; | |
457 $paired_end = 0; | |
458 | |
459 ### subsetting the input file(s) | |
460 unless ($multicore == 1){ # not needed in single-core mode | |
461 # warn "My PID: $pid\nMy offset: $offset\n"; | |
462 if ($sequence_file_format eq 'FASTA'){ | |
463 my $temp_filename = subset_input_file_FastA($filename,$pid,$offset); | |
464 warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; | |
465 $filename = "${temp_dir}$temp_filename"; | |
466 } | |
467 else{ # FastQ format, default | |
468 my $temp_filename = subset_input_file_FastQ($filename,$pid,$offset); | |
469 warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n"; | |
470 $filename = "${temp_dir}$temp_filename"; | |
471 } | |
472 } | |
473 | |
474 ### Initialising bisulfite conversion filenames | |
475 my ($C_to_T_infile,$G_to_A_infile); | |
476 | |
477 ### FastA format | |
478 if ($sequence_file_format eq 'FASTA'){ | |
479 warn "Input file is in FastA format\n"; | |
480 if ($directional){ | |
481 ($C_to_T_infile) = biTransformFastAFiles ($filename); | |
482 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; | |
483 } | |
484 else{ | |
485 ($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename); | |
486 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; | |
487 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; | |
488 } | |
489 | |
490 ### Creating 4 different bowtie filehandles and storing the first entry | |
491 if ($bowtie2){ | |
492 single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile); | |
493 } | |
494 else{ | |
495 single_end_align_fragments_to_bisulfite_genome_fastA ($C_to_T_infile,$G_to_A_infile); | |
496 } | |
497 } | |
498 | |
499 ## FastQ format | |
500 else{ | |
501 warn "Input file is in FastQ format\n"; | |
502 if ($directional){ | |
503 ($C_to_T_infile) = biTransformFastQFiles ($filename); | |
504 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; | |
505 } | |
506 elsif($pbat){ | |
507 ($G_to_A_infile) = biTransformFastQFiles ($filename); | |
508 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files | |
509 } | |
510 else{ | |
511 ($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename); | |
512 $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile; | |
513 $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile; | |
514 } | |
515 | |
516 ### Creating up to 4 different bowtie filehandles and storing the first entry | |
517 if ($pbat){ | |
518 if ($bowtie2){ # as of version 0.10.2 we also support PBAT alignments for Bowtie 2 | |
519 single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (undef,$G_to_A_infile); | |
520 } | |
521 else{ | |
522 single_end_align_fragments_to_bisulfite_genome_fastQ (undef,$G_to_A_infile); | |
523 } | |
524 } | |
525 elsif ($bowtie2){ | |
526 single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile); | |
527 } | |
528 else{ | |
529 single_end_align_fragments_to_bisulfite_genome_fastQ ($C_to_T_infile,$G_to_A_infile); | |
530 } | |
531 } | |
532 | |
533 start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid); | |
534 | |
535 } | |
536 | |
537 ### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS | |
538 | |
539 if ($pid){ # only performing this for the parent process | |
540 | |
541 if ($multicore > 1){ | |
542 | |
543 warn "Now waiting for all child processes to complete\n"; | |
544 | |
545 ### we need to ensure that we wait for all child processes to be finished before continuing | |
546 # warn "here are the child IDs: @pids\n"; | |
547 # warn "Looping through the child process IDs:\n"; | |
548 | |
549 foreach my $id (@pids){ | |
550 # print "$id\t"; | |
551 my $kid = waitpid ($id,0); | |
552 # print "Returned: $kid\nExit status: $?\n"; | |
553 unless ($? == 0){ | |
554 warn "\nChild process terminated with exit signal: '$?'\n\n"; | |
555 } | |
556 } | |
557 | |
558 # regenerating names for temporary files | |
559 my @temp_input; | |
560 my @temp_output; | |
561 my @temp_reports; | |
562 my @temp_unmapped_1; # will store single end reads or R1 of paired-end | |
563 my @temp_unmapped_2; | |
564 my @temp_ambiguous_1; # will store single end reads or R1 of paired-end | |
565 my @temp_ambiguous_2; | |
566 my @temp_ambig_bam; | |
567 | |
568 for (1..$offset){ | |
569 | |
570 # Temp Input Files | |
571 if ($single_end){ | |
572 if ($gzip){ | |
573 push @temp_input, "${original_filename}.temp.${_}.gz"; | |
574 } | |
575 else{ | |
576 push @temp_input, "${original_filename}.temp.${_}"; | |
577 } | |
578 | |
579 } | |
580 elsif($paired_end){ | |
581 if ($gzip){ | |
582 push @temp_input, "${original_filename_1}.temp.${_}.gz"; | |
583 push @temp_input, "${original_filename_2}.temp.${_}.gz"; | |
584 } | |
585 else{ | |
586 push @temp_input, "${original_filename_1}.temp.${_}"; | |
587 push @temp_input, "${original_filename_2}.temp.${_}"; | |
588 } | |
589 } | |
590 | |
591 # if files had a prefix we need to specify it | |
592 my $add_prefix; | |
593 if (defined $prefix){ | |
594 $add_prefix = "${prefix}."; | |
595 } | |
596 else{ | |
597 $add_prefix = ''; | |
598 } | |
599 | |
600 # Temp Output Files | |
601 if ($single_end){ | |
602 | |
603 if ($bowtie2){ | |
604 if ($gzip){ | |
605 push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2.bam"; | |
606 push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2_SE_report.txt"; | |
607 push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_bt2.ambig.bam"; # only for Bowtie 2 | |
608 } | |
609 else{ | |
610 push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2.bam"; | |
611 push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2_SE_report.txt"; | |
612 push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_bt2.ambig.bam"; # only for Bowtie 2 | |
613 } | |
614 } | |
615 else{ | |
616 if ($gzip){ | |
617 push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark.bam"; | |
618 push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_bismark_SE_report.txt"; | |
619 } | |
620 else{ | |
621 push @temp_output, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark.bam"; | |
622 push @temp_reports, "${output_dir}${add_prefix}${original_filename}.temp.${_}_bismark_SE_report.txt"; | |
623 } | |
624 } | |
625 | |
626 if ($unmapped){ | |
627 if ($gzip){ | |
628 push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_unmapped_reads.fq"; | |
629 } | |
630 else{ | |
631 push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_unmapped_reads.fq"; | |
632 } | |
633 } | |
634 | |
635 if ($ambiguous){ | |
636 if ($gzip){ | |
637 push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}.gz_ambiguous_reads.fq"; | |
638 } | |
639 else{ | |
640 push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename}.temp.${_}_ambiguous_reads.fq"; | |
641 } | |
642 } | |
643 | |
644 } | |
645 elsif($paired_end){ | |
646 if ($bowtie2){ | |
647 if ($gzip){ | |
648 push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_pe.bam"; | |
649 push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_PE_report.txt"; | |
650 push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2 | |
651 } | |
652 else{ | |
653 push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_pe.bam"; | |
654 push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_PE_report.txt"; | |
655 push @temp_ambig_bam, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2 | |
656 } | |
657 } | |
658 else{ | |
659 if ($gzip){ | |
660 push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_pe.bam"; | |
661 push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_bismark_PE_report.txt"; | |
662 } | |
663 else{ | |
664 push @temp_output, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_pe.bam"; | |
665 push @temp_reports, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_bismark_PE_report.txt"; | |
666 } | |
667 } | |
668 | |
669 if ($unmapped){ | |
670 if ($gzip){ | |
671 push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_unmapped_reads_1.fq"; | |
672 push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_unmapped_reads_2.fq"; | |
673 } | |
674 else{ | |
675 push @temp_unmapped_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_unmapped_reads_1.fq"; | |
676 push @temp_unmapped_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_unmapped_reads_2.fq"; | |
677 } | |
678 } | |
679 | |
680 if ($ambiguous){ | |
681 if ($gzip){ | |
682 push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}.gz_ambiguous_reads_1.fq"; | |
683 push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}.gz_ambiguous_reads_2.fq"; | |
684 } | |
685 else{ | |
686 push @temp_ambiguous_1, "${output_dir}${add_prefix}${original_filename_1}.temp.${_}_ambiguous_reads_1.fq"; | |
687 push @temp_ambiguous_2, "${output_dir}${add_prefix}${original_filename_2}.temp.${_}_ambiguous_reads_2.fq"; | |
688 } | |
689 } | |
690 | |
691 } | |
692 } | |
693 | |
694 warn "\n\nRight, cleaning up now...\n\n"; | |
695 | |
696 # deleting temp files; | |
697 warn "Deleting temporary sequence files...\n"; | |
698 foreach my $temp (@temp_input){ | |
699 #print "$temp\t"; | |
700 $temp =~ s/.*\///; # deleting path information | |
701 print "${temp_dir}${temp}\t"; | |
702 unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n"; | |
703 } | |
704 print "\n\n"; | |
705 | |
706 # merging temp BAM files | |
707 if ($single_end){ | |
708 merge_individual_BAM_files(\@temp_output,$original_filename,$single_end); | |
709 } | |
710 else{ | |
711 merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end); | |
712 } | |
713 | |
714 # deleting temp BAM files | |
715 warn "Deleting temporary BAM files...\n"; | |
716 foreach my $temp (@temp_output){ | |
717 # print "$temp\t"; | |
718 $temp =~ s/.*\///; # deleting path information | |
719 print "${output_dir}${temp}\t"; | |
720 unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n"; | |
721 } | |
722 print "\n\n"; | |
723 | |
724 ### AMBIGUOUS BAM files | |
725 if ($ambig_bam){ | |
726 | |
727 # merging temp AMBIG BAM files | |
728 if ($single_end){ | |
729 merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename,$single_end); | |
730 } | |
731 else{ | |
732 merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename_1,$single_end); | |
733 } | |
734 | |
735 # deleting temp BAM files | |
736 warn "Deleting temporary ambiguous BAM files...\n"; | |
737 foreach my $temp (@temp_ambig_bam){ | |
738 # print "$temp\t"; | |
739 $temp =~ s/.*\///; # deleting path information | |
740 print "${output_dir}${temp}\t"; | |
741 unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous BAM file ${output_dir}${temp}: $!\n"; | |
742 } | |
743 print "\n\n"; | |
744 } | |
745 | |
746 if ($unmapped){ | |
747 if ($single_end){ | |
748 merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end); | |
749 } | |
750 else{ | |
751 merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1'); | |
752 merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2'); | |
753 } | |
754 | |
755 # deleting temp unmapped files | |
756 warn "Deleting temporary unmapped files...\n"; | |
757 foreach my $temp (@temp_unmapped_1){ | |
758 print "$temp\t"; | |
759 unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; | |
760 } | |
761 if ($paired_end){ | |
762 foreach my $temp (@temp_unmapped_2){ | |
763 print "$temp\t"; | |
764 unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n"; | |
765 } | |
766 } | |
767 print "\n\n"; | |
768 | |
769 } | |
770 | |
771 if ($ambiguous){ | |
772 if ($single_end){ | |
773 merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end); | |
774 } | |
775 else{ | |
776 merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1'); | |
777 merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2'); | |
778 } | |
779 | |
780 # deleting temp ambiguous files | |
781 warn "Deleting temporary ambiguous files...\n"; | |
782 foreach my $temp (@temp_ambiguous_1){ | |
783 print "$temp\t"; | |
784 unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; | |
785 } | |
786 | |
787 if ($paired_end){ | |
788 foreach my $temp (@temp_ambiguous_2){ | |
789 print "$temp\t"; | |
790 unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n"; | |
791 } | |
792 } | |
793 print "\n\n"; | |
794 } | |
795 | |
796 # resetting the counters once more so we can add all data from all temporary reports | |
797 reset_counters_and_fhs($original_filename); | |
798 | |
799 ### Merging the Bismark mapping report files | |
800 if ($single_end){ | |
801 merge_individual_mapping_reports(\@temp_reports,$original_filename,$single_end); | |
802 print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis'); | |
803 } | |
804 else{ | |
805 merge_individual_mapping_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2); | |
806 print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis'); | |
807 } | |
808 | |
809 # deleting temp report files | |
810 warn "Deleting temporary report files...\n"; | |
811 foreach my $temp (@temp_reports){ | |
812 print "$temp\t"; | |
813 unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n"; | |
814 } | |
815 print "\n\n"; | |
816 | |
817 } | |
818 | |
819 } | |
820 | |
821 if ($pid){ # only for the Parent | |
822 warn "\n====================\nBismark run complete\n====================\n\n"; | |
823 | |
824 if ($nucleotide_coverage){ | |
825 warn "Now calculating observed and expected nucleotide coverage statistics... \n\n"; | |
826 if ($final_output_filename =~ /(bam|cram)|/){ | |
827 my @args; | |
828 push @args, "--genome $genome_folder"; | |
829 push @args, "--dir '$output_dir'"; | |
830 push @args, "--samtools_path $samtools_path"; | |
831 push @args, $final_output_filename; | |
832 print "@args","\n"; sleep(3); | |
833 | |
834 system ("$Bin/bam2nuc @args"); | |
835 warn "Finished bam2nuc calculation ...\n\n"; | |
836 | |
837 } | |
838 else{ | |
839 warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n"; | |
840 } | |
841 } | |
842 | |
843 } | |
844 | |
845 } | |
846 | |
sub merge_individual_mapping_reports{

    ### Merges the temporary per-process mapping reports into one final report.
    ###
    ### Arguments:
    ###   $temp_reports        - array reference of temporary report names; any leading path is
    ###                          stripped IN PLACE (the names are later combined with $output_dir)
    ###   $original_filename_1 - input file the final report name is derived from
    ###   $single_end          - true for single-end runs, false for paired-end runs
    ###   $original_filename_2 - second input file (only used for the paired-end header line)
    ###
    ### The general header lines of the first temporary report are copied to the final report,
    ### and read_alignment_report() is called for every temporary report.
    ### The REPORT filehandle is left open on return.
    ### NOTE(review): presumably the print_final_analysis_report_* subs write the summary to it - confirm.

    my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_;

    my $report_file = $original_filename_1;
    $report_file =~ s/.*\///; # removing path information
    $report_file =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter

    if ($prefix){
        $report_file = "${prefix}.${report_file}";
    }

    if ($basename){ # Output file basename is set using the -B argument
        $report_file = ${basename};
    }

    if ($single_end){
        $report_file .= $bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt';
    }
    else{
        $report_file .= $bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt';
    }

    warn "Writing report to ${output_dir}${report_file}\n";
    open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n";

    foreach my $temp (@$temp_reports){
        $temp =~ s/.*\///; # removing path information (modifies the caller's array on purpose)
    }

    warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n";

    if ($single_end){
        print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n";
    }
    else{ # paired-end
        print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n";
    }

    # becomes true once the header section (everything up to 'Final Alignment') has been copied
    my $header_copied = 0;

    foreach my $temp (@$temp_reports){

        warn "Merging from file >> $temp <<\n";

        ### the header lines are only taken from the first report; once they have been copied there
        ### is no need to scan the remaining reports line by line (read_alignment_report opens them anyway)
        unless ($header_copied){
            open (my $in_fh,'<',"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}': $!\n";

            while (my $line = <$in_fh>){
                chomp $line;
                next if ($line =~ /^Bismark report/); # the title line has already been written above

                if ($line =~ /^Final Alignment/){
                    $header_copied = 1;
                    last;
                }
                print REPORT "$line\n";
            }
            close $in_fh or warn "Failed to close filehandle\n";
        }

        ### Simon says: You are going to regret this in the future. Just for the record. He might be right...
        read_alignment_report($temp,$single_end);

    }
    warn "\n";

}
929 | |
sub read_alignment_report{

    ### Reads one temporary mapping report from $output_dir and adds the numbers found in it
    ### to the file-level %counting hash.
    ###
    ### Arguments:
    ###   $report     - name of the temporary report file (relative to $output_dir)
    ###   $single_end - true: strand counts go to the single-end keys of %counting,
    ###                 false: they go to the paired-end keys
    ###
    ### Dies if the report cannot be opened.

    my ($report,$single_end) = @_;

    # Every tally starts at 0 so that a line missing from a report simply adds nothing
    # (instead of raising 'uninitialized value' warnings in the additions further down)
    my %tally = map { $_ => 0 } qw(
        total_seqs unique no_aln multiple no_genomic
        meth_CpG   meth_CHG   meth_CHH   meth_unknown
        unmeth_CpG unmeth_CHG unmeth_CHH unmeth_unknown
        number_OT  number_CTOT number_CTOB number_OB
    );

    # Line prefix => tally it feeds. Rules are tried in order and the first match wins.
    # The 'Total number of C' line is not listed as its value was never used for merging.
    my @line_rules = (
        ### General Alignment stats (paired-end wording first, then single-end)
        [ qr/^Sequence pairs analysed in total:/                                                      , 'total_seqs'     ],
        [ qr/^Sequences analysed in total:/                                                           , 'total_seqs'     ],
        [ qr/^Number of paired-end alignments with a unique best hit:/                                , 'unique'         ],
        [ qr/^Number of alignments with a unique best hit from/                                       , 'unique'         ],
        [ qr/^Sequence pairs with no alignments under any condition:/                                 , 'no_aln'         ],
        [ qr/^Sequences with no alignments under any condition:/                                      , 'no_aln'         ],
        [ qr/^Sequence pairs did not map uniquely:/                                                   , 'multiple'       ],
        [ qr/^Sequences did not map uniquely:/                                                        , 'multiple'       ],
        [ qr/^Sequence pairs which were discarded because genomic sequence could not be extracted:/   , 'no_genomic'     ],
        [ qr/^Sequences which were discarded because genomic sequence could not be extracted:/        , 'no_genomic'     ],

        ### Context Methylation
        [ qr/^Total methylated C\'s in CpG context:/                                                  , 'meth_CpG'       ],
        [ qr/^Total methylated C\'s in CHG context:/                                                  , 'meth_CHG'       ],
        [ qr/^Total methylated C\'s in CHH context:/                                                  , 'meth_CHH'       ],
        [ qr/^Total methylated C\'s in Unknown context:/                                              , 'meth_unknown'   ],
        [ qr/^Total unmethylated C\'s in CpG context:/                                                , 'unmeth_CpG'     ],
        [ qr/^Total C to T conversions in CpG context:/                                               , 'unmeth_CpG'     ],
        [ qr/^Total unmethylated C\'s in CHG context:/                                                , 'unmeth_CHG'     ],
        [ qr/^Total C to T conversions in CHG context:/                                               , 'unmeth_CHG'     ],
        [ qr/^Total unmethylated C\'s in CHH context:/                                                , 'unmeth_CHH'     ],
        [ qr/^Total C to T conversions in CHH context:/                                               , 'unmeth_CHH'     ],
        [ qr/^Total unmethylated C\'s in Unknown context:/                                            , 'unmeth_unknown' ],
        [ qr/^Total C to T conversions in Unknown context:/                                           , 'unmeth_unknown' ],

        ### Strand Origin (paired-end label first, then single-end)
        [ qr/^CT\/GA\/CT:/                                                                            , 'number_OT'      ],
        [ qr/^CT\/CT:/                                                                                , 'number_OT'      ],
        [ qr/^GA\/CT\/CT:/                                                                            , 'number_CTOT'    ],
        [ qr/^GA\/CT:/                                                                                , 'number_CTOT'    ],
        [ qr/^GA\/CT\/GA:/                                                                            , 'number_CTOB'    ],
        [ qr/^GA\/GA:/                                                                                , 'number_CTOB'    ],
        [ qr/^CT\/GA\/GA:/                                                                            , 'number_OB'      ],
        [ qr/^CT\/GA:/                                                                                , 'number_OB'      ],
    );

    open (my $aln_fh,'<',"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report': $!\n";

    LINE: while (my $line = <$aln_fh>){
        chomp $line;

        foreach my $rule (@line_rules){
            my ($pattern,$tally_name) = @$rule;
            next unless ($line =~ $pattern);

            # the number is the second tab-separated field of the line
            my (undef,$value) = split /\t/, $line;
            $tally{$tally_name} = $value if (defined $value);
            next LINE;
        }
    }

    close $aln_fh or warn "Failed to close filehandle\n";

    $counting{sequences_count}                               += $tally{total_seqs};
    $counting{unique_best_alignment_count}                   += $tally{unique};
    $counting{no_single_alignment_found}                     += $tally{no_aln};
    $counting{unsuitable_sequence_count}                     += $tally{multiple};
    $counting{genomic_sequence_could_not_be_extracted_count} += $tally{no_genomic};

    $counting{total_meCHH_count} += $tally{meth_CHH};
    $counting{total_meCHG_count} += $tally{meth_CHG};
    $counting{total_meCpG_count} += $tally{meth_CpG};
    if ($bowtie2){ # the Unknown context is only added up for Bowtie 2 runs
        $counting{total_meC_unknown_count} += $tally{meth_unknown};
    }

    $counting{total_unmethylated_CHH_count} += $tally{unmeth_CHH};
    $counting{total_unmethylated_CHG_count} += $tally{unmeth_CHG};
    $counting{total_unmethylated_CpG_count} += $tally{unmeth_CpG};
    if ($bowtie2){
        $counting{total_unmethylated_C_unknown_count} += $tally{unmeth_unknown};
    }

    if ($single_end){
        $counting{CT_CT_count} += $tally{number_OT};
        $counting{CT_GA_count} += $tally{number_OB};
        $counting{GA_CT_count} += $tally{number_CTOT};
        $counting{GA_GA_count} += $tally{number_CTOB};
    }
    else{
        # paired-end
        $counting{GA_CT_CT_count} += $tally{number_CTOT};
        $counting{CT_GA_CT_count} += $tally{number_OT};
        $counting{GA_CT_GA_count} += $tally{number_CTOB};
        $counting{CT_GA_GA_count} += $tally{number_OB};
    }
}
1119 | |
sub merge_individual_ambiguous_files{

    ### Concatenates the temporary ambiguous-read files into a single gzip compressed file in $output_dir.
    ###
    ### Arguments:
    ###   $temp_ambiguous     - array reference of temporary file names; any leading path is stripped
    ###                         IN PLACE (the names are later combined with $output_dir)
    ###   $original_filename  - input file the merged file name is derived from
    ###   $single_end         - true for single-end runs
    ###   $paired_information - read tag ('_1' or '_2') appended for paired-end runs, ignored otherwise
    ###
    ### Temporary files ending in 'gz' are decompressed via gunzip while reading.

    my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_;

    my $ambiguous_file = $original_filename;
    $ambiguous_file =~ s/.*\///; # removing path information

    if ($prefix){
        $ambiguous_file = "${prefix}.${ambiguous_file}";
    }

    # Output file basename set using the -B argument replaces the (prefixed) input file name
    my $stem      = $basename ? $basename : $ambiguous_file;
    my $read_tag  = ($single_end or !defined $paired_information) ? '' : $paired_information;
    my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq.gz' : 'fa.gz';
    $ambiguous_file = "${stem}_ambiguous_reads${read_tag}.${extension}";

    foreach my $temp (@$temp_ambiguous){
        $temp =~ s/.*\///; # removing path information (modifies the caller's array on purpose)
    }

    # The output redirection needs a shell, so the target path is single-quoted to survive
    # spaces and shell metacharacters in the output directory or file name
    (my $quoted_target = "${output_dir}${ambiguous_file}") =~ s/'/'\\''/g;
    open (my $merged_fh,'|-',"gzip -c - > '$quoted_target'") or die "Failed to write to $ambiguous_file: $!\n";
    warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n";

    foreach my $temp (@$temp_ambiguous){
        warn "Merging from file >> $temp <<\n";
        my $temp_path = "${output_dir}${temp}";
        my $in_fh;

        if ($temp =~ /gz$/){
            # list form: gunzip is started directly without a shell, so no quoting is required
            open ($in_fh,'-|','gunzip','-c',$temp_path) or die "Failed to read from ambiguous temp file '$temp_path': $!\n";
        }
        else{
            open ($in_fh,'<',$temp_path) or die "Failed to read from ambiguous temp file '$temp_path': $!\n";
        }

        while (my $line = <$in_fh>){
            print {$merged_fh} $line;
        }
        close $in_fh or warn "Failed to close filehandle\n";
    }
    warn "\n";

    close $merged_fh or warn "Failed to close output filehandle AMBIGUOUS\n\n";
}
1194 | |
1195 | |
sub merge_individual_unmapped_files{

    ### Concatenates the temporary unmapped-read files into a single gzip compressed file in $output_dir.
    ###
    ### Arguments:
    ###   $temp_unmapped      - array reference of temporary file names; any leading path is stripped
    ###                         IN PLACE (the names are later combined with $output_dir)
    ###   $original_filename  - input file the merged file name is derived from
    ###   $single_end         - true for single-end runs
    ###   $paired_information - read tag ('_1' or '_2') appended for paired-end runs, ignored otherwise
    ###
    ### Temporary files ending in 'gz' are decompressed via gunzip while reading.
    ### A lexical filehandle is used for the merged output so that the global UNMAPPED handle,
    ### which the alignment subs write to, is not touched here.

    my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_;

    my $unmapped_file = $original_filename;
    $unmapped_file =~ s/.*\///; # removing path information

    if ($prefix){
        $unmapped_file = "${prefix}.${unmapped_file}";
    }

    # Output file basename set using the -B argument replaces the (prefixed) input file name
    my $stem      = $basename ? $basename : $unmapped_file;
    my $read_tag  = ($single_end or !defined $paired_information) ? '' : $paired_information;
    my $extension = ($sequence_file_format eq 'FASTQ') ? 'fq.gz' : 'fa.gz';
    $unmapped_file = "${stem}_unmapped_reads${read_tag}.${extension}";

    foreach my $temp (@$temp_unmapped){
        $temp =~ s/.*\///; # removing path information (modifies the caller's array on purpose)
    }

    # The output redirection needs a shell, so the target path is single-quoted to survive
    # spaces and shell metacharacters in the output directory or file name
    (my $quoted_target = "${output_dir}${unmapped_file}") =~ s/'/'\\''/g;
    open (my $merged_fh,'|-',"gzip -c - > '$quoted_target'") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n";
    warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n";

    foreach my $temp (@$temp_unmapped){
        warn "Merging from file >> $temp <<\n";
        my $temp_path = "${output_dir}${temp}";
        my $in_fh;

        if ($temp =~ /gz$/){
            # list form: gunzip is started directly without a shell, so no quoting is required
            open ($in_fh,'-|','gunzip','-c',$temp_path) or die "Failed to read from unmapped temp file '$temp_path': $!\n";
        }
        else{
            open ($in_fh,'<',$temp_path) or die "Failed to read from unmapped temp file '$temp_path': $!\n";
        }

        while (my $line = <$in_fh>){
            print {$merged_fh} $line;
        }
        close $in_fh or warn "Failed to close filehandle\n";
    }
    warn "\n";

    close $merged_fh or warn "Failed to close output filehandle UNMAPPED\n\n";
}
1270 | |
1271 | |
sub merge_individual_BAM_files{

    ### Streams all temporary BAM files through 'samtools view' into one merged BAM
    ### (or CRAM, if $cram is set) file in $output_dir and records its path in $final_output_filename.
    ###
    ### Arguments:
    ###   $tempbam           - array reference of temporary BAM names; leading paths are stripped in place
    ###   $original_filename - input file the merged file name is derived from
    ###   $single_end        - true for single-end runs, false for paired-end runs

    my ($tempbam,$original_filename,$single_end) = @_;

    # the temporary files are addressed relative to $output_dir, so drop their path information
    $_ =~ s/.*\/// foreach (@$tempbam);

    my $merged_name = $original_filename;
    $merged_name =~ s/.*\///;                                 # deleting path information
    $merged_name =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
    $merged_name = "$prefix.$merged_name" if ($prefix);

    # paired-end files carry an additional '_pe' tag; BAM is the default output for both aligners
    my $pe_tag = $single_end ? '' : '_pe';
    if ($basename){ # Output file basename is set using the -B argument
        $merged_name = "${basename}${pe_tag}.bam";
    }
    else{
        $merged_name .= ($bowtie2 ? '_bismark_bt2' : '_bismark') . "${pe_tag}.bam";
    }

    if ($cram){
        $merged_name =~ s/bam$/cram/;
        warn "At this stage we write out a single CRAM file and delete all temporary BAM files\n";
        warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";
        $final_output_filename = "${output_dir}${merged_name}";

        open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to CRAM file $merged_name: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
    }
    else{
        $final_output_filename = "${output_dir}${merged_name}";
        warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";
        open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n";
    }

    my $files_merged = 0;

    foreach my $temp_bam (@$tempbam){

        warn "Merging from file >> $temp_bam <<\n";

        # only the very first file contributes its header (-h) to the merged output
        my $reader = $files_merged > 0
            ? "$samtools_path view ${output_dir}${temp_bam} |"
            : "$samtools_path view -h ${output_dir}${temp_bam} |";
        open (IN,$reader) or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";

        print OUT while (<IN>);

        close IN or warn "Failed to close filehandle\n";
        ++$files_merged;
    }
    warn "\n";

    close OUT or warn "Failed to close output filehandle\n\n";

}
1356 | |
1357 | |
sub merge_individual_ambig_BAM_files{

    ### Streams all temporary ambiguous BAM files through 'samtools view' into one merged
    ### '.ambig.bam' file in $output_dir.
    ###
    ### Arguments:
    ###   $tempbam           - array reference of temporary BAM names; leading paths are stripped in place
    ###   $original_filename - input file the merged file name is derived from
    ###   $single_end        - true for single-end runs, false for paired-end runs
    ###
    ### NOTE(review): unlike merge_individual_BAM_files the fastq/fq(.gz) ending is NOT removed from the
    ### name here, and without Bowtie 2 and without a basename no suffix is appended at all - confirm
    ### whether this is intended.

    my ($tempbam,$original_filename,$single_end) = @_;

    # the temporary files are addressed relative to $output_dir, so drop their path information
    $_ =~ s/.*\/// foreach (@$tempbam);

    my $merged_name = $original_filename;
    $merged_name =~ s/.*\///; # deleting path information
    $merged_name = "$prefix.$merged_name" if ($prefix);

    # paired-end files carry an additional '_pe' tag
    my $pe_tag = $single_end ? '' : '_pe';
    if ($basename){ # Output file basename is set using the -B argument
        $merged_name = "${basename}${pe_tag}.ambig.bam";
    }
    elsif ($bowtie2){ # BAM format is the default for Bowtie 2
        $merged_name .= "_bismark_bt2${pe_tag}.ambig.bam";
    }

    warn "Now merging ambiguous BAM files @$tempbam into >>> $merged_name <<<\n";
    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n";

    my $files_merged = 0;

    foreach my $temp_bam (@$tempbam){

        warn "Merging from file >> $temp_bam <<\n";

        # only the very first file contributes its header (-h) to the merged output
        my $reader = $files_merged > 0
            ? "$samtools_path view ${output_dir}${temp_bam} |"
            : "$samtools_path view -h ${output_dir}${temp_bam} |";
        open (IN,$reader) or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";

        print OUT while (<IN>);

        close IN or warn "Failed to close filehandle\n";
        ++$files_merged;
    }
    warn "\n";

    close OUT or warn "Failed to close output filehandle\n\n";
}
1420 | |
sub start_methylation_call_procedure_single_ends {

    ### Prepares all output filehandles for a single-end run (alignment output OUT, mapping REPORT and,
    ### if requested, AMBIBAM, UNMAPPED and AMBIG), writes the report header lines and then hands over
    ### to the FastA or FastQ specific methylation call routine.
    ###
    ### Arguments:
    ###   $sequence_file - input sequence file (may include a path)
    ###   $C_to_T_infile, $G_to_A_infile, $pid - passed on unchanged to the methylation call routine

    my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

    # file name without any leading directories
    my $filename = $sequence_file;
    if ($sequence_file =~ /\//){
        (undef,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
    }

    # common stem for the alignment output and the report: (prefix.)filename without fastq/fq(.gz)
    my $stem = $filename;
    $stem =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
    $stem = "$prefix.$stem" if ($prefix);

    ### printing all alignments to a results file
    my $alignment_suffix = $bowtie2 ? '_bismark_bt2.sam' # SAM format is the default for Bowtie 2
                         : $vanilla ? '_bismark.txt'     # vanilla custom Bismark single-end output (like Bismark versions 0.5.X)
                         :            '_bismark.sam';    # SAM is the default output
    my $outfile = $stem;
    $outfile =~ s/$/$alignment_suffix/;
    $outfile = "${basename}.sam" if ($basename); # Output file basename is set using the -B argument

    $bam = 0 if (!defined $bam);

    if ($ambig_bam){
        my $ambig_bam_out = $outfile;
        $ambig_bam_out =~ s/sam$/ambig.bam/;
        warn "Ambiguous BAM output: $ambig_bam_out\n";
        open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n";
    }

    if ($cram and $multicore > 1){
        ### for multicore processing we write out BAM files by default and merge them together as a single CRAM file
        ### in the merging step later on. This avoids having to change all the file endings on the way
        $outfile =~ s/sam$/bam/;
        open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif ($cram){ ### single-core mode: writing out CRAM directly. This will require Samtools version 1.2 or higher!
        $outfile =~ s/sam$/cram/;
        $final_output_filename = "${output_dir}${outfile}";
        open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
    }
    elsif ($bam == 1){ ### Samtools is installed, writing out BAM directly
        $outfile =~ s/sam$/bam/;
        $final_output_filename = "${output_dir}${outfile}";
        open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif ($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
        $outfile .= '.gz';
        open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    else{ # uncompressed output, default
        open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";

    sleep(1);

    print OUT "Bismark version: $bismark_version\n" if ($vanilla);

    ### printing alignment and methylation call summary to a report file
    my $report_suffix = $bowtie2 ? '_bismark_bt2_SE_report.txt' : '_bismark_SE_report.txt';
    my $reportfile = $stem;
    $reportfile =~ s/$/$report_suffix/;
    $reportfile = "${basename}_SE_report.txt" if ($basename); # Output file basename is set using the -B argument

    open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
    print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

    if ($unmapped){
        my $reads_ext     = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
        my $unmapped_file = $prefix ? "$prefix.$filename" : $filename;

        if ($basename){ # Output file basename is set using the -B argument
            $unmapped_file = "${basename}_unmapped_reads.${reads_ext}";
        }
        else{
            $unmapped_file =~ s/$/_unmapped_reads.$reads_ext/;
        }

        if ($multicore > 1){ # multicore runs write a plain file here; merge_individual_unmapped_files gzips the merged output
            open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
        }
        else{
            $unmapped_file .= '.gz';
            open (UNMAPPED,"| gzip -c - > $output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
        }
        warn "Unmapped sequences will be written to $output_dir$unmapped_file\n";
    }

    if ($ambiguous){
        my $reads_ext      = ($sequence_file_format eq 'FASTQ') ? 'fq' : 'fa';
        my $ambiguous_file = $prefix ? "$prefix.$filename" : $filename;

        if ($basename){ # Output file basename is set using the -B argument
            $ambiguous_file = "${basename}_ambiguous_reads.${reads_ext}";
        }
        else{
            $ambiguous_file =~ s/$/_ambiguous_reads.$reads_ext/;
        }

        if ($multicore > 1){ # multicore runs write a plain file here; merge_individual_ambiguous_files gzips the merged output
            open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
        }
        else{
            $ambiguous_file .= '.gz';
            open (AMBIG,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
        }
        warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
    }

    ### documenting the library type in the report
    if ($directional){
        print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n";
    }
    elsif ($pbat){
        print REPORT "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n";
    }
    else{
        print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n";
    }

    my $aligner_name = $bowtie2 ? 'Bowtie 2' : 'Bowtie';
    print REPORT "Bismark was run with $aligner_name against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

    # neither the vanilla format nor --sam-no-hd output gets a SAM header
    if (!$vanilla and !$sam_no_hd){
        generate_SAM_header();
    }

    if ($sequence_file_format eq 'FASTA'){
        ### Input file is in FastA format
        process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
    }
    else{
        ### Input file is in FastQ format
        process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
    }
}
1617 | |
### Prepares every output stream of a paired-end run and then hands over to the
### paired-end methylation caller.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2  - the two read files; the bare file name of read 1 seeds the
###                                         alignment and report file names, both names seed the
###                                         unmapped/ambiguous read file names
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2,
###   $pid                                - passed through unchanged to the methylation caller
###
### Opens the bareword handles OUT and REPORT and, on request, AMBIBAM, UNMAPPED_1/UNMAPPED_2 and
### AMBIG_1/AMBIG_2. File-level settings ($prefix, $basename, $bam, $cram, $multicore, ...) decide
### the names and formats. Returns whatever the selected methylation caller returns.
sub start_methylation_call_procedure_paired_ends {
    my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;

    ### output names are built from the bare file names, i.e. everything after the last '/'
    my $strip_directory = sub {
        my ($path) = @_;
        return $path unless ($path =~ /\//);
        my (undef,$name) = $path =~ m/(.*\/)(.*)$/;
        return $name;
    };
    my $filename_1 = $strip_directory->($sequence_file_1);
    my $filename_2 = $strip_directory->($sequence_file_2);

    ### ---------- alignment output file ----------
    my $outfile = $filename_1;
    $outfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # shortens the name by removing fastq.gz etc.
    $outfile = "$prefix.$outfile" if ($prefix);

    my $outfile_suffix = $bowtie2 ? '_bismark_bt2_pe.sam' # SAM format is the default Bowtie 2 output
                       : $vanilla ? '_bismark_pe.txt'     # vanilla custom Bismark paired-end output (like Bismark versions 0.5.X)
                       :            '_bismark_pe.sam';    # SAM format is the default Bowtie 1 output
    $outfile =~ s/$/$outfile_suffix/;

    ### an output basename (-B) replaces the name derived from the input file altogether
    $outfile = "${basename}_pe.sam" if ($basename);

    if ($ambig_bam){
        (my $ambig_bam_out = $outfile) =~ s/sam$/ambig.bam/;
        warn "Ambiguous BAM output: $ambig_bam_out\n";
        open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n";
    }

    $bam = 0 if (not defined $bam);

    ### Picking the output format. $final_output_filename is only set for single-core CRAM and for BAM output.
    if ($cram and $multicore > 1){ # CRAM requested in a multicore run: this process writes BAM
        $outfile =~ s/sam$/bam/;
        open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif ($cram){ # single-core mode, writing out CRAM directly. This requires Samtools version 1.2 or higher!
        $outfile =~ s/sam$/cram/;
        $final_output_filename = "${output_dir}${outfile}";
        open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
    }
    elsif ($bam == 1){ # Samtools is installed, writing out BAM directly
        $outfile =~ s/sam$/bam/;
        $final_output_filename = "${output_dir}${outfile}";
        open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    elsif ($bam == 2){ # no Samtools found on system. Using GZIP compression instead
        $outfile .= '.gz';
        open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }
    else{ # uncompressed output, default
        open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
    }

    warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
    sleep(1);

    print OUT "Bismark version: $bismark_version\n" if ($vanilla);

    ### ---------- alignment and methylation call summary (report file) ----------
    my $reportfile = $filename_1;
    $reportfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # shortens the name by removing fastq.gz etc.
    $reportfile = "$prefix.$reportfile" if ($prefix);

    my $report_suffix = $bowtie2 ? '_bismark_bt2_PE_report.txt' : '_bismark_PE_report.txt';
    $reportfile =~ s/$/$report_suffix/;

    $reportfile = "${basename}_PE_report.txt" if ($basename); # Output file basename is set using the -B argument

    open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
    print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";

    my $aligner_line = $bowtie2
        ? "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n"
        : "Bismark was run with Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n";
    print REPORT $aligner_line;

    ### ---------- unmapped read output ----------
    if ($unmapped){
        my $fastq_input = ($sequence_file_format eq 'FASTQ');
        my ($unmapped_1,$unmapped_2) = ($filename_1,$filename_2);

        if ($prefix){
            ($unmapped_1,$unmapped_2) = ("$prefix.$unmapped_1","$prefix.$unmapped_2");
        }

        if ($basename and $fastq_input){ # Output file basename is set using the -B argument
            $unmapped_1 = "${basename}_unmapped_reads_1.fq";
            $unmapped_2 = "${basename}_unmapped_reads_2.fq";
        }
        elsif ($basename){
            $unmapped_1 = "${basename}_unmapped_reads_1.fa";
            $unmapped_2 = "${basename}_unmapped_reads_2.fa";
        }
        elsif ($fastq_input){
            $unmapped_1 =~ s/$/_unmapped_reads_1.fq/;
            $unmapped_2 =~ s/$/_unmapped_reads_2.fq/;
        }
        else{
            $unmapped_1 =~ s/$/_unmapped_reads_1.fa/;
            $unmapped_2 =~ s/$/_unmapped_reads_2.fa/;
        }

        if ($multicore > 1){ # unmapped files are merged into .gz files in multicore runs anyway
            open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
            open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
        }
        else{ # single-core runs compress on the fly
            $unmapped_1 .= '.gz';
            $unmapped_2 .= '.gz';
            open (UNMAPPED_1,"| gzip -c - > $output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
            open (UNMAPPED_2,"| gzip -c - > $output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
        }
        warn "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
    }

    ### ---------- ambiguous read output ----------
    if ($ambiguous){
        my $fastq_input = ($sequence_file_format eq 'FASTQ');
        my ($amb_1,$amb_2) = ($filename_1,$filename_2);

        if ($prefix){
            ($amb_1,$amb_2) = ("$prefix.$amb_1","$prefix.$amb_2");
        }

        if ($basename and $fastq_input){ # Output file basename is set using the -B argument
            $amb_1 = "${basename}_ambiguous_reads_1.fq";
            $amb_2 = "${basename}_ambiguous_reads_2.fq";
        }
        elsif ($basename){
            $amb_1 = "${basename}_ambiguous_reads_1.fa";
            $amb_2 = "${basename}_ambiguous_reads_2.fa";
        }
        elsif ($fastq_input){
            $amb_1 =~ s/$/_ambiguous_reads_1.fq/;
            $amb_2 =~ s/$/_ambiguous_reads_2.fq/;
        }
        else{
            $amb_1 =~ s/$/_ambiguous_reads_1.fa/;
            $amb_2 =~ s/$/_ambiguous_reads_2.fa/;
        }

        if ($multicore > 1){ # ambiguous files are merged into .gz files in multicore runs anyway
            open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
            open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
        }
        else{ # single-core runs compress on the fly
            $amb_1 .= '.gz';
            $amb_2 .= '.gz';
            open (AMBIG_1,"| gzip -c - > $output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
            open (AMBIG_2,"| gzip -c - > $output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
        }
        warn "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
    }

    ### ---------- library type, recorded in the report ----------
    my $library_line = $directional ? "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n"
                     : $pbat        ? "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n"
                     :                "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n";
    print REPORT $library_line;

    ### no SAM header for the vanilla format or when headers were switched off
    generate_SAM_header() unless ($vanilla or $sam_no_hd);

    ### Input files are in FastA format
    return process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) if ($sequence_file_format eq 'FASTA');

    ### Input files are in FastQ format
    return process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
1844 | |
### Writes the final alignment and cytosine methylation summary of a single-end run to
### STDERR/STDOUT and to the already opened REPORT filehandle.
###
### Arguments:
###   $C_to_T_infile, $G_to_A_infile - names (relative to $temp_dir) of the temporary C->T / G->A infiles
###   $pid                           - not used by this sub
###   $merge_multi                   - true when a merged report for several sub-reports is printed; the
###                                    temporary files are then left alone
###
### All figures are read from the file-level %counting hash.
sub print_final_analysis_report_single_end{
    my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_;

    ### most summary lines are identical on STDERR and in the report file
    my $warn_and_report = sub { warn @_; print REPORT @_; };

    if ($merge_multi){
        warn "Printing a final merged alignment report for all individual sub-reports\n\n";
    }
    else{
        ### All sequences from the original sequence file have been analysed now,
        ### deleting temporary C->T or G->A infiles. Which files are removed depends on the library type.
        my @temp_files;
        if ($directional){
            @temp_files = ("$temp_dir$C_to_T_infile");
        }
        elsif ($pbat){
            @temp_files = ("$temp_dir$G_to_A_infile");
        }
        else{
            @temp_files = ("$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile");
        }

        my $noun      = (@temp_files == 1) ? 'file' : 'files';
        my $file_list = join (' and ',@temp_files);

        my $deleted = unlink @temp_files; # unlink returns the number of files it removed
        if ($deleted == @temp_files){
            warn "\nSuccessfully deleted the temporary $noun $file_list\n\n";
        }
        else{
            ### report the very paths handed to unlink (including $temp_dir) so the message can be acted upon
            warn "Could not delete temporary $noun $file_list properly: $!\n";
        }
    }

    ### printing a final report for the alignment procedure
    $warn_and_report->("Final Alignment report\n",'='x22,"\n");

    ### printing a final report for the methylation call procedure
    $warn_and_report->("Sequences analysed in total:\t$counting{sequences_count}\n");

    ### mapping efficiency is reported as 0 when nothing was analysed (avoids a division by zero)
    my $percent_alignable_sequences = ($counting{sequences_count} == 0)
        ? 0
        : sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});

    ### the STDERR version carries an extra blank line
    warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
    print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";

    ### this part of the summary goes to STDOUT (not STDERR) and into the report
    my @alignment_details = (
        "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
        "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
        "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
        "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
        join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
        "\n\n",
    );
    if ($directional){
        push @alignment_details, "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    }
    print @alignment_details;
    print REPORT @alignment_details;

    ### detailed information about Cs analysed (the Unknown context is only reported for Bowtie 2 runs)
    my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

    $warn_and_report->("Final Cytosine Methylation Report\n",'='x33,"\n");
    $warn_and_report->("Total number of C's analysed:\t$total_number_of_C\n\n");

    $warn_and_report->("Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n");
    $warn_and_report->("Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n");
    $warn_and_report->("Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n");
    if ($bowtie2){
        $warn_and_report->("Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n");
    }
    $warn_and_report->("\n");

    $warn_and_report->("Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n");
    $warn_and_report->("Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n");
    $warn_and_report->("Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n");
    if ($bowtie2){
        $warn_and_report->("Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n");
    }
    $warn_and_report->("\n");

    ### percentage of methylated calls in one context; undef when no C was seen in that context at all
    my $percent_methylated = sub {
        my ($methylated,$unmethylated) = @_;
        return unless (($methylated+$unmethylated) > 0);
        return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
    };

    my $percent_meCHG       = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
    my $percent_meCHH       = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});
    my $percent_meCpG       = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
    my $percent_meC_unknown = $percent_methylated->($counting{total_meC_unknown_count},$counting{total_unmethylated_C_unknown_count});

    ### A percentage is printed whenever it could be computed. Testing definedness (rather than truth)
    ### makes explicit that a genuine 0.0% is a reportable value.

    ### printing methylated CpG percentage if applicable
    if (defined $percent_meCpG){
        $warn_and_report->("C methylated in CpG context:\t${percent_meCpG}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CpG context if value was 0\n");
    }

    ### printing methylated C percentage (CHG context) if applicable
    if (defined $percent_meCHG){
        $warn_and_report->("C methylated in CHG context:\t${percent_meCHG}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CHG context if value was 0\n");
    }

    ### printing methylated C percentage (CHH context) if applicable
    if (defined $percent_meCHH){
        $warn_and_report->("C methylated in CHH context:\t${percent_meCHH}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CHH context if value was 0\n");
    }

    ### printing methylated C percentage (Unknown C context) if applicable
    if ($bowtie2){
        if (defined $percent_meC_unknown){
            $warn_and_report->("C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n");
        }
        else{
            $warn_and_report->("Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n");
        }
    }
    $warn_and_report->("\n\n");

    if ($seqID_contains_tabs){
        $warn_and_report->("The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n");
    }
}
2041 | |
2042 | |
### Writes the final alignment and cytosine methylation summary of a paired-end run to
### STDERR/STDOUT and to the already opened REPORT filehandle.
###
### Arguments:
###   $C_to_T_infile_1, $G_to_A_infile_1,
###   $C_to_T_infile_2, $G_to_A_infile_2 - names (relative to $temp_dir) of the temporary C->T / G->A infiles;
###                                        the read 2 names may be empty, in which case fewer files are deleted
###   $pid                               - not used by this sub
###   $merge_multi                       - true when a merged report for several sub-reports is printed; the
###                                        temporary files are then left alone
###
### All figures are read from the file-level %counting hash.
sub print_final_analysis_report_paired_ends{
    my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_;

    ### helper for lines that are identical on STDERR and in the report file
    my $warn_and_report = sub { warn @_; print REPORT @_; };

    ### All sequences from the original sequence file have been analysed now, therefore deleting
    ### temporary C->T or G->A infiles (unless this is only the merged report)
    if ($merge_multi){
        warn "Printing a final merged alignment report for all individual sub-reports\n\n";
    }
    elsif ($directional and $G_to_A_infile_2){
        my $removed = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
        if ($removed == 2){
            warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
        }
        else{
            warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
        }
    }
    elsif ($directional){ # for paired-end FastQ infiles with Bowtie1 there is only one file to delete
        my $removed = unlink "$temp_dir$C_to_T_infile_1";
        if ($removed == 1){
            warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile_1\n\n";
        }
        else{
            warn "Could not delete temporary file $temp_dir$C_to_T_infile_1 properly: $!\n";
        }
    }
    elsif ($G_to_A_infile_2 and $C_to_T_infile_2){
        my $removed = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
        if ($removed == 4){
            warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
        }
        else{
            warn "Could not delete temporary files properly: $!\n";
        }
    }
    else{ # for paired-end FastQ infiles with Bowtie1 there are only two files to delete
        my $removed = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1";
        if ($removed == 2){
            warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_1\n\n";
        }
        else{
            warn "Could not delete temporary files properly: $!\n";
        }
    }

    ### printing a final report for the alignment procedure
    $warn_and_report->("Final Alignment report\n",'='x22,"\n");

    ### printing a final report for the methylation call procedure
    $warn_and_report->("Sequence pairs analysed in total:\t$counting{sequences_count}\n");

    ### mapping efficiency is reported as 0 when nothing was analysed
    my $percent_alignable_sequence_pairs = ($counting{sequences_count} == 0)
        ? 0
        : sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});

    ### the alignment details go to STDOUT (not STDERR); the report line differs in its line ending
    print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
    print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";

    my @alignment_details = (
        "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
        "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
        "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
        "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n",
        join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),
        "\n\n",
    );
    if ($directional){
        push @alignment_details, "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
    }
    print @alignment_details;
    print REPORT @alignment_details;

    ### detailed information about Cs analysed (the Unknown context is only reported for Bowtie 2 runs)
    $warn_and_report->("Final Cytosine Methylation Report\n",'='x33,"\n");

    my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
    $warn_and_report->("Total number of C's analysed:\t$total_number_of_C\n\n");

    $warn_and_report->("Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n");
    $warn_and_report->("Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n");
    $warn_and_report->("Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n");
    if ($bowtie2){ # the report file gets an additional blank line after the Unknown context figure
        warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n";
        print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
    }
    $warn_and_report->("\n");

    $warn_and_report->("Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n");
    $warn_and_report->("Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n");
    $warn_and_report->("Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n");
    if ($bowtie2){ # the report file gets an additional blank line after the Unknown context figure
        warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n";
        print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
    }
    $warn_and_report->("\n");

    ### percentage of methylated calls in one context; stays undefined when no C was seen in that context
    my $percent_methylated = sub {
        my ($methylated,$unmethylated) = @_;
        return unless (($methylated+$unmethylated) > 0);
        return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
    };

    my $percent_meCHG       = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
    my $percent_meCHH       = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});
    my $percent_meCpG       = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
    my $percent_meC_unknown = $percent_methylated->($counting{total_meC_unknown_count},$counting{total_unmethylated_C_unknown_count});

    ### printing methylated CpG percentage if applicable
    if ($percent_meCpG){
        $warn_and_report->("C methylated in CpG context:\t${percent_meCpG}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CpG context if value was 0\n");
    }

    ### printing methylated C percentage in CHG context if applicable
    if ($percent_meCHG){
        $warn_and_report->("C methylated in CHG context:\t${percent_meCHG}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CHG context if value was 0\n");
    }

    ### printing methylated C percentage in CHH context if applicable
    if ($percent_meCHH){
        $warn_and_report->("C methylated in CHH context:\t${percent_meCHH}%\n");
    }
    else{
        $warn_and_report->("Can't determine percentage of methylated Cs in CHH context if value was 0\n");
    }

    ### printing methylated C percentage (Unknown C context) if applicable
    if ($bowtie2){
        if ($percent_meC_unknown){
            $warn_and_report->("C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n");
        }
        else{
            $warn_and_report->("Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n");
        }
    }
    $warn_and_report->("\n\n");

}
2239 | |
### Reads a single-end FastA file record by record (one header line followed by one sequence line)
### and hands every read to the Bowtie 1 or Bowtie 2 result checker, which compares it against the
### alignments to the converted genomes so that a methylation call can be made.
###
### Arguments:
###   $sequence_file                      - FastA file to read; a name ending in .gz is read through 'gunzip -c'
###   $C_to_T_infile, $G_to_A_infile, $pid - not used here, passed on unchanged to print_final_analysis_report_single_end()
###
### Honours the global options $skip and $upto, increments $counting{sequences_count} for every read that is
### processed, and writes reads to AMBIG (--ambiguous, checker returned 2) or UNMAPPED (--un, checker returned 1).
sub process_single_end_fastA_file_for_methylation_call{
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### The file name is never handed to a shell or interpreted as an open mode: the list form of the pipe open
  ### runs gunzip directly, and the 3-argument open treats the name literally. File names containing spaces or
  ### shell metacharacters therefore work and cannot inject commands.
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','gunzip','-c',$sequence_file) or die $!;
  }
  else{
    open (IN,'<',$sequence_file) or die $!;
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence   = <IN>;
    ### testing for definedness (= end of file) rather than truth, so that a final line consisting of a plain '0' is not mistaken for EOF
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers every record in the file, whereas $counting{sequences_count} only counts records that are actually processed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    ### the checker may return nothing at all; normalising to 0 keeps the numeric comparisons below free of warnings
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }
    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input handle (and reaping the gunzip child, if any). The return value is deliberately ignored:
  ### when --upto stops reading early, gunzip is terminated by SIGPIPE and close would report a spurious failure.
  close IN;

  close OUT or warn "Failed to close filehandle OUT: $!\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);

}
2314 | |
### Reads a single-end FastQ file record by record (4 lines per read: header, sequence, second header,
### quality string) and hands every read to the Bowtie 1 or Bowtie 2 result checker, which compares it
### against the alignments to the converted genomes so that a methylation call can be made.
###
### Arguments:
###   $sequence_file                      - FastQ file to read; a name ending in .gz is read through 'gunzip -c'
###   $C_to_T_infile, $G_to_A_infile, $pid - not used here, passed on unchanged to print_final_analysis_report_single_end()
###
### Honours the global options $skip and $upto, increments $counting{sequences_count} for every read that is
### processed, and writes reads to AMBIG (--ambiguous, checker returned 2) or UNMAPPED (--un, checker returned 1).
sub process_single_end_fastQ_file_for_methylation_call{

  my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;

  ### The file name is never handed to a shell or interpreted as an open mode: the list form of the pipe open
  ### runs gunzip directly, and the 3-argument open treats the name literally. File names containing spaces or
  ### shell metacharacters therefore work and cannot inject commands.
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','gunzip','-c',$sequence_file) or die $!;
  }
  else{
    open (IN,'<',$sequence_file) or die $!;
  }

  my $count = 0;

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier_2  = <IN>;
    my $quality_value = <IN>;
    ### testing for definedness (= end of file) rather than truth, so that a final line consisting of a plain '0' is not mistaken for EOF
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    ### $count numbers every record in the file, whereas $counting{sequences_count} only counts records that are actually processed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;
    ### $identifier_2 is intentionally left unchomped, it is written out verbatim (including its line ending) below

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    ### the checker may return nothing at all; normalising to 0 keeps the numeric comparisons below free of warnings
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }
    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }
  print "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input handle (and reaping the gunzip child, if any). The return value is deliberately ignored:
  ### when --upto stops reading early, gunzip is terminated by SIGPIPE and close would report a spurious failure.
  close IN;

  close OUT or warn "Failed to close filehandle OUT: $!\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);
  if ($ambig_bam){
    close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n";
  }
}
2400 | |
### Processes the two FastA files of a paired-end run in lock-step (one header line and one sequence line per
### read and file). We need the actual sequences of both reads to compare them against the genomic sequence in
### order to make a methylation call. The sequence identifier per definition needs to be the same for a sequence
### pair used for paired-end mapping, so only the identifier of read 1 is passed on to the result checker.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastA files for read 1 and read 2; each file whose name ends in .gz is
###                                        read through 'gunzip -c', independently of the other file
###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2, $pid
###                                      - not used here, passed on unchanged to print_final_analysis_report_paired_ends()
###
### Honours the global options $skip and $upto, increments $counting{sequences_count} for every pair that is processed,
### and writes pairs to AMBIG_1/AMBIG_2 (--ambiguous, checker returned 2) or UNMAPPED_1/UNMAPPED_2 (--un, checker returned 1).
sub process_fastA_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;

  ### Opening both input files. Compression is decided per file, so a pair consisting of one gzipped and one plain
  ### file is read correctly. The file names are never handed to a shell or interpreted as an open mode (list form
  ### of the pipe open / 3-argument open), so names containing spaces or shell metacharacters are safe.
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($fh,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($fh,'-|','gunzip','-c',$file) or die "Failed to open gunzip -c pipe to $file $!\n";
    }
    else{
      open ($fh,'<',$file) or die $!;
    }
  }

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one

  my $count = 0;

  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1   = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2   = <IN2>;
    ### testing for definedness (= end of file) rather than truth, so that a final line consisting of a plain '0' is not mistaken for EOF
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier_2 and defined $sequence_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers every record pair in the files, whereas $counting{sequences_count} only counts pairs that are actually processed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies of the fixed header lines, taken before chomping and before the leading > is removed; these are what gets written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;

    $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1);
    }

    ### normalising a false/undefined result to 0 keeps the numeric comparisons below free of warnings
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
    }
    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input handles (and reaping the gunzip children, if any). The return values are deliberately ignored:
  ### when --upto stops reading early, gunzip is terminated by SIGPIPE and close would report a spurious failure.
  close IN1;
  close IN2;

  close OUT or die $!;

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2494 | |
### Processes the two FastQ files of a paired-end run in lock-step (4 lines per read and file: header, sequence,
### second header, quality string). We need the actual sequence of both reads to compare them against the genomic
### sequence in order to make a methylation call. The sequence identifier per definition needs to be the same for a
### sequence pair used for paired-end alignments, so only the identifier of read 1 is passed on to the result checker.
###
### Arguments:
###   $sequence_file_1, $sequence_file_2 - FastQ files for read 1 and read 2; each file whose name ends in .gz is
###                                        read through 'gunzip -c', independently of the other file
###   $C_to_T_infile_1, $G_to_A_infile_1, $C_to_T_infile_2, $G_to_A_infile_2, $pid
###                                      - not used here, passed on unchanged to print_final_analysis_report_paired_ends()
###
### Honours the global options $skip and $upto, increments $counting{sequences_count} for every pair that is processed,
### and writes pairs to AMBIG_1/AMBIG_2 (--ambiguous, checker returned 2) or UNMAPPED_1/UNMAPPED_2 (--un, checker returned 1).
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;

  ### Opening both input files. Compression is decided per file, so a pair consisting of one gzipped and one plain
  ### file is read correctly. The file names are never handed to a shell or interpreted as an open mode (list form
  ### of the pipe open / 3-argument open), so names containing spaces or shell metacharacters are safe.
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($fh,$file) = @{$input};
    if ($file =~ /\.gz$/){
      open ($fh,'-|','gunzip','-c',$file) or die "Failed to open gunzip -c pipe to $file $!\n";
    }
    else{
      open ($fh,'<',$file) or die $!;
    }
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1    = <IN1>;
    my $sequence_1      = <IN1>;
    my $ident_1         = <IN1>; # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_1 = <IN1>; # passed on to the result checker and written to the ambiguous/unmapped output
    # reading from the second input file
    my $identifier_2    = <IN2>;
    my $sequence_2      = <IN2>;
    my $ident_2         = <IN2>; # second header line, only needed for the ambiguous/unmapped output
    my $quality_value_2 = <IN2>; # passed on to the result checker and written to the ambiguous/unmapped output
    ### testing for definedness (= end of file) rather than truth, so that a final line consisting of a plain '0' is not mistaken for EOF.
    ### The quality line is the last line read from each file, so a defined quality value implies that the second header was read as well
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### $count numbers every record pair in the files, whereas $counting{sequences_count} only counts pairs that are actually processed
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequence pairs so far\n";
    }

    ### copies of the fixed header lines, taken before chomping and before the leading @ is removed; these are what gets written to the ambiguous/unmapped files
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;
    ### $ident_1 and $ident_2 are intentionally left unchomped, they are written out verbatim (including their line ending) below

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc($sequence_1),uc($sequence_2),$identifier_1,$quality_value_1,$quality_value_2);
    }

    ### normalising a false/undefined result to 0 keeps the numeric comparisons below free of warnings
    unless ($return){
      $return = 0;
    }

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }
    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  warn "Processed $counting{sequences_count} sequences in total\n\n";

  ### Releasing the input handles (and reaping the gunzip children, if any). The return values are deliberately ignored:
  ### when --upto stops reading early, gunzip is terminated by SIGPIPE and close would report a spurious failure.
  close IN1;
  close IN2;

  close OUT or warn "Failed to close filehandle OUT: $!\n\n";
  if ($ambig_bam){
    close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n\n";
  }

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);

}
2608 | |
2609 sub check_bowtie_results_single_end{ | |
2610 my ($sequence,$identifier,$quality_value) = @_; | |
2611 | |
2612 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout | |
2613 $quality_value = 'I'x(length$sequence); | |
2614 } | |
2615 | |
2616 my %mismatches = (); | |
2617 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome | |
2618 foreach my $index (0..$#fhs){ | |
2619 | |
2620 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
2621 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); | |
2622 ### if the sequence we are currently looking at produced an alignment we are doing various things with it | |
2623 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
2624 ############################################################### | |
2625 ### STEP I Now processing the alignment stored in last_line ### | |
2626 ############################################################### | |
2627 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier); | |
2628 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation | |
2629 ### we only continue to extract useful information about this alignment if 1 was returned | |
2630 if ($valid_alignment_found_1 == 1){ | |
2631 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself | |
2632 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) | |
2633 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; | |
2634 | |
2635 unless($mismatch_info){ | |
2636 $mismatch_info = ''; | |
2637 } | |
2638 | |
2639 chomp $mismatch_info; | |
2640 my $chromosome; | |
2641 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
2642 $chromosome = $mapped_chromosome; | |
2643 } | |
2644 else{ | |
2645 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
2646 } | |
2647 ### Now extracting the number of mismatches to the converted genome | |
2648 my $number_of_mismatches; | |
2649 if ($mismatch_info eq ''){ | |
2650 $number_of_mismatches = 0; | |
2651 } | |
2652 elsif ($mismatch_info =~ /^\d/){ | |
2653 my @mismatches = split (/,/,$mismatch_info); | |
2654 $number_of_mismatches = scalar @mismatches; | |
2655 } | |
2656 else{ | |
2657 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n"; | |
2658 } | |
2659 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
2660 my $alignment_location = join (":",$chromosome,$position); | |
2661 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
2662 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same | |
2663 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index | |
2664 ### number for the found alignment) | |
2665 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ | |
2666 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; | |
2667 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; | |
2668 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; | |
2669 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; | |
2670 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; | |
2671 } | |
2672 $number_of_mismatches = undef; | |
2673 ################################################################################################################################################## | |
2674 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a | |
2675 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will | |
2676 ### be returned as $valid_alignment_found and it will then be processed in the next round only. | |
2677 ################################################################################################################################################## | |
2678 my $newline = $fhs[$index]->{fh}-> getline(); | |
2679 if ($newline){ | |
2680 my ($seq_id) = split (/\t/,$newline); | |
2681 $fhs[$index]->{last_seq_id} = $seq_id; | |
2682 $fhs[$index]->{last_line} = $newline; | |
2683 } | |
2684 else { | |
2685 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) | |
2686 $fhs[$index]->{last_seq_id} = undef; | |
2687 $fhs[$index]->{last_line} = undef; | |
2688 next; | |
2689 } | |
2690 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier); | |
2691 ### we only continue to extract useful information about this second alignment if 1 was returned | |
2692 if ($valid_alignment_found_2 == 1){ | |
2693 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself | |
2694 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse) | |
2695 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7]; | |
2696 unless($mismatch_info){ | |
2697 $mismatch_info = ''; | |
2698 } | |
2699 chomp $mismatch_info; | |
2700 | |
2701 my $chromosome; | |
2702 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
2703 $chromosome = $mapped_chromosome; | |
2704 } | |
2705 else{ | |
2706 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
2707 } | |
2708 | |
2709 ### Now extracting the number of mismatches to the converted genome | |
2710 my $number_of_mismatches; | |
2711 if ($mismatch_info eq ''){ | |
2712 $number_of_mismatches = 0; | |
2713 } | |
2714 elsif ($mismatch_info =~ /^\d/){ | |
2715 my @mismatches = split (/,/,$mismatch_info); | |
2716 $number_of_mismatches = scalar @mismatches; | |
2717 } | |
2718 else{ | |
2719 die "Something weird is going on with the mismatch field\n"; | |
2720 } | |
2721 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table | |
2722 ### extracting the chromosome number from the bowtie output (see above) | |
2723 my $alignment_location = join (":",$chromosome,$position); | |
2724 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position | |
2725 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this | |
2726 ### case we are not writing the same entry out a second time. | |
2727 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){ | |
2728 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id; | |
2729 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence; | |
2730 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index; | |
2731 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome; | |
2732 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position; | |
2733 } | |
2734 #################################################################################################################################### | |
2735 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ### | |
2736 #################################################################################################################################### | |
2737 $newline = $fhs[$index]->{fh}-> getline(); | |
2738 if ($newline){ | |
2739 my ($seq_id) = split (/\t/,$newline); | |
2740 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier); | |
2741 $fhs[$index]->{last_seq_id} = $seq_id; | |
2742 $fhs[$index]->{last_line} = $newline; | |
2743 next; | |
2744 } | |
2745 else { | |
2746 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output) | |
2747 $fhs[$index]->{last_seq_id} = undef; | |
2748 $fhs[$index]->{last_line} = undef; | |
2749 next; | |
2750 } | |
2751 ### still within the 2nd sequence in correct orientation found | |
2752 } | |
2753 ### still withing the 1st sequence in correct orientation found | |
2754 } | |
2755 ### still within the if (last_seq_id eq identifier) condition | |
2756 } | |
2757 ### still within foreach index loop | |
2758 } | |
2759 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file | |
2760 unless(%mismatches){ | |
2761 $counting{no_single_alignment_found}++; | |
2762 if ($unmapped){ | |
2763 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified | |
2764 } | |
2765 else{ | |
2766 return; | |
2767 } | |
2768 } | |
2769 ####################################################################################################################################################### | |
2770 ####################################################################################################################################################### | |
2771 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ### | |
2772 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ### | |
2773 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ### | |
2774 ####################################################################################################################################################### | |
2775 ####################################################################################################################################################### | |
2776 ### Going to use the variable $sequence_fails as a memory if a sequence could not be aligned uniquely (set to 1 then) | |
2777 my $sequence_fails = 0; | |
2778 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
2779 my $methylation_call_params; # hash reference! | |
2780 ### sorting in ascending order | |
2781 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){ | |
2782 | |
2783 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment | |
2784 if (scalar keys %{$mismatches{$mismatch_number}} == 1){ | |
2785 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){ | |
2786 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence}; | |
2787 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome}; | |
2788 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position}; | |
2789 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index}; | |
2790 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; | |
2791 } | |
2792 } | |
2793 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){ | |
2794 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and | |
2795 ### come from different alignments processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a | |
2796 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome | |
2797 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite | |
2798 ### reaction. E.g. | |
2799 ### CAGTCACGCGCGCGCG will become | |
2800 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition. | |
2801 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave | |
2802 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!: | |
2803 ### G->A conversion: | |
2804 ### highly methylated: CAATCACACACACACA | |
2805 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce | |
2806 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as the | |
2807 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted | |
2808 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts. | |
2809 ### We do not want any bias, whatsover. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions | |
2810 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of | |
2811 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment. | |
2812 ### In the above example the number of transliterations required to transform the actual sequence | |
2813 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment) | |
2814 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments) | |
2815 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments | |
2816 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed | |
2817 ### transliterations. As a threshold which does scale we will start with the number of tranliterations of the lowest best match x 2 must still be | |
2818 ### smaller than the number of tranliterations of the second best sequence. Everything will be flagged with $sequence_fails = 1 and discarded. | |
2819 my @three_candidate_seqs; | |
2820 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){ | |
2821 my $transliterations_performed; | |
2822 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){ | |
2823 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT'); | |
2824 } | |
2825 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){ | |
2826 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA'); | |
2827 } | |
2828 else{ | |
2829 die "unexpected index number range $!\n"; | |
2830 } | |
2831 push @three_candidate_seqs,{ | |
2832 index =>$mismatches{$mismatch_number}->{$composite_location}->{index}, | |
2833 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence}, | |
2834 mismatch_number => $mismatch_number, | |
2835 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome}, | |
2836 position => $mismatches{$mismatch_number}->{$composite_location}->{position}, | |
2837 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id}, | |
2838 transliterations_performed => $transliterations_performed, | |
2839 }; | |
2840 } | |
2841 ### sorting in ascending order for the lowest number of transliterations performed | |
2842 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs; | |
2843 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed}; | |
2844 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed}; | |
2845 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed}; | |
2846 # print "$first_array_element\t$second_array_element\t$third_array_element\n"; | |
2847 if (($first_array_element*2) < $second_array_element){ | |
2848 $counting{low_complexity_alignments_overruled_count}++; | |
2849 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits | |
2850 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence}; | |
2851 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome}; | |
2852 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position}; | |
2853 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index}; | |
2854 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number; | |
2855 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n"; | |
2856 } | |
2857 else{ | |
2858 $sequence_fails = 1; | |
2859 } | |
2860 } | |
2861 else{ | |
2862 $sequence_fails = 1; | |
2863 } | |
2864 ### after processing the alignment with the lowest number of mismatches we exit | |
2865 last; | |
2866 } | |
2867 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions | |
2868 if ($sequence_fails == 1){ | |
2869 $counting{unsuitable_sequence_count}++; | |
2870 if ($ambiguous){ | |
2871 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified | |
2872 } | |
2873 if ($unmapped){ | |
2874 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified | |
2875 } | |
2876 else{ | |
2877 return 0; # => exits to next sequence (default) | |
2878 } | |
2879 } | |
2880 | |
2881 ### --DIRECTIONAL | |
2882 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
2883 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
2884 if ($directional){ | |
2885 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ | |
2886 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
2887 $counting{alignments_rejected_count}++; | |
2888 return 0; | |
2889 } | |
2890 } | |
2891 | |
2892 ### If the sequence has not been rejected so far it will have a unique best alignment | |
2893 $counting{unique_best_alignment_count}++; | |
2894 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params); | |
2895 | |
2896 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call | |
2897 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ | |
2898 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; | |
2899 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
2900 return 0; | |
2901 } | |
2902 | |
2903 ### otherwise we are set to perform the actual methylation call | |
2904 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); | |
2905 | |
2906 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value); | |
2907 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out | |
2908 } | |
2909 | |
2910 sub check_bowtie_results_single_end_bowtie2{ | |
2911 my ($sequence,$identifier,$quality_value) = @_; | |
2912 | |
2913 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout | |
2914 $quality_value = 'I'x(length$sequence); | |
2915 } | |
2916 | |
2917 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name. | |
2918 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs | |
2919 # print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n"; | |
2920 | |
2921 my $alignment_ambiguous = 0; | |
2922 my $first_ambig_alignment; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified | |
2923 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far | |
2924 my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true. | |
2925 | |
2926 my %alignments = (); | |
2927 | |
2928 ### reading from the Bowtie 2 output filehandles | |
2929 foreach my $index (0..$#fhs){ | |
2930 # print "Index: $index\n"; | |
2931 # print "$fhs[$index]->{last_line}\n"; | |
2932 # print "$fhs[$index]->{last_seq_id}\n"; | |
2933 # sleep (1); | |
2934 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
2935 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id}); | |
2936 | |
2937 ### if the sequence we are currently looking at produced an alignment we are doing various things with it | |
2938 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n"; | |
2939 | |
2940 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
2941 # SAM format specifications for Bowtie 2 | |
2942 # (1) Name of read that aligned | |
2943 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
2944 # 1 The read is one of a pair | |
2945 # 2 The alignment is one end of a proper paired-end alignment | |
2946 # 4 The read has no reported alignments | |
2947 # 8 The read is one of a pair and has no reported alignments | |
2948 # 16 The alignment is to the reverse reference strand | |
2949 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
2950 # 64 The read is mate 1 in a pair | |
2951 # 128 The read is mate 2 in a pair | |
2952 # 256 The read has multiple mapping states | |
2953 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
2954 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
2955 # (5) Mapping quality (255 means MAPQ is not available) | |
2956 # (6) CIGAR string representation of alignment (* if unavailable) | |
2957 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
2958 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
2959 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
2960 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
2961 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
2962 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
2963 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
2964 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
2965 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
2966 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
2967 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
2968 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2969 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
2970 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
2971 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
2972 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
2973 | |
2974 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10]; | |
2975 | |
2976 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance | |
2977 if ($flag == 4){ | |
2978 ## reading in the next alignment, which must be the next sequence | |
2979 my $newline = $fhs[$index]->{fh}-> getline(); | |
2980 if ($newline){ | |
2981 chomp $newline; | |
2982 my ($seq_id) = split (/\t/,$newline); | |
2983 $fhs[$index]->{last_seq_id} = $seq_id; | |
2984 $fhs[$index]->{last_line} = $newline; | |
2985 if ($seq_id eq $identifier){ | |
2986 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
2987 } | |
2988 next; # next instance | |
2989 } | |
2990 else{ | |
2991 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
2992 $fhs[$index]->{last_seq_id} = undef; | |
2993 $fhs[$index]->{last_line} = undef; | |
2994 next; | |
2995 } | |
2996 } | |
2997 | |
2998 # if there are one or more proper alignments we can extract the chromosome number | |
2999 my $chromosome; | |
3000 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){ | |
3001 $chromosome = $mapped_chromosome; | |
3002 } | |
3003 else{ | |
3004 die "Chromosome number extraction failed for $mapped_chromosome\n"; | |
3005 } | |
3006 | |
3007 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
3008 my ($alignment_score,$second_best,$MD_tag); | |
3009 my @fields = split (/\t/,$fhs[$index]->{last_line}); | |
3010 | |
3011 foreach (11..$#fields){ | |
3012 if ($fields[$_] =~ /AS:i:(.*)/){ | |
3013 $alignment_score = $1; | |
3014 } | |
3015 elsif ($fields[$_] =~ /XS:i:(.*)/){ | |
3016 $second_best = $1; | |
3017 } | |
3018 elsif ($fields[$_] =~ /MD:Z:(.*)/){ | |
3019 $MD_tag = $1; | |
3020 } | |
3021 } | |
3022 | |
3023 my $overwrite = 0; # If we get 2 alignments to the very same position, e.g. to OT with and AS of -156 and to CTOB with and AS of 0 we need the latter to trump the former, else | |
3024 # the read will be assigned to the wrong strand which may result in incorrect methylation calls. | |
3025 # this was brought to our attention by Sylvain Foret (ANU Canberra), 13 April 2016 | |
3026 | |
3027 if (!defined $best_AS_so_far){ | |
3028 $best_AS_so_far = $alignment_score; | |
3029 $overwrite++; | |
3030 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3031 if ($ambig_bam){ # also setting the first_ambig_alignment | |
3032 $first_ambig_alignment = $fhs[$index]->{last_line}; | |
3033 $first_ambig_alignment =~ s/_(CT|GA)_converted//; | |
3034 # warn "$first_ambig_alignment\n"; sleep(1); | |
3035 } | |
3036 } | |
3037 else{ | |
3038 if ($alignment_score >= $best_AS_so_far){ # AS are generally negative with a maximum of 0; | |
3039 # 19 07 2016: changed this to >= so that equally good alignments are also added. Ambiguous alignments from different threads will be identified later on | |
3040 $best_AS_so_far = $alignment_score; | |
3041 $overwrite++; | |
3042 # warn "Found better or equal alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3043 | |
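3044 # 22 07 2016: resetting the ambiguous score within same thread only if the current alignment is really better than the previous one. NOTE(review): $best_AS_so_far has already been set to $alignment_score a few lines above, so the strict '>' comparison below can never be true as written - confirm the intended statement order | |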
3045 if ($alignment_score > $best_AS_so_far){ | |
3046 # warn "Resetting amb within thread value to 0\n"; | |
3047 $amb_same_thread = 0; | |
3048 | |
3049 if ($ambig_bam){ # also setting a new first_ambig_alignment | |
3050 $first_ambig_alignment = $fhs[$index]->{last_line}; | |
3051 $first_ambig_alignment =~ s/_(CT|GA)_converted//; | |
3052 # warn "$first_ambig_alignment\n"; sleep(1); | |
3053 } | |
3054 } | |
3055 } | |
3056 else{ | |
3057 # warn "Current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; | |
3058 } | |
3059 } | |
3060 | |
3061 # warn "First best alignment_score is: '$alignment_score'\n"; | |
3062 # warn "MD tag is: '$MD_tag'\n"; | |
3063 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag); | |
3064 | |
3065 if (defined $second_best){ | |
3066 # warn "second best alignment_score is: '$second_best'\n\n"; | |
3067 | |
3068 # If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this | |
3069 if ($alignment_score == $second_best){ | |
3070 | |
3071 # checking to see if this read produced the best alignment | |
3072 if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous | |
3073 # warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; | |
3074 $amb_same_thread = 1; | |
3075 } | |
3076 else{ | |
3077 # warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; | |
3078 } | |
3079 | |
3080 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether | |
3081 | |
3082 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
3083 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
3084 my $newline = $fhs[$index]->{fh}-> getline(); | |
3085 if ($newline){ | |
3086 chomp $newline; | |
3087 my ($seq_id) = split (/\t/,$newline); | |
3088 $fhs[$index]->{last_seq_id} = $seq_id; | |
3089 $fhs[$index]->{last_line} = $newline; | |
3090 } | |
3091 else{ | |
3092 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3093 $fhs[$index]->{last_seq_id} = undef; | |
3094 $fhs[$index]->{last_line} = undef; | |
3095 last; # break free in case we have reached the end of the alignment output | |
3096 } | |
3097 } | |
3098 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
3099 } | |
3100 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
3101 | |
3102 my $alignment_location = join (":",$chromosome,$position); | |
3103 | |
3104 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3105 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
3106 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
3107 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
3108 | |
3109 if ($overwrite){ | |
3110 $alignments{$alignment_location}->{seq_id} = $id; | |
3111 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
3112 $alignments{$alignment_location}->{alignment_score_second_best} = $second_best; | |
3113 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
3114 $alignments{$alignment_location}->{index} = $index; | |
3115 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
3116 $alignments{$alignment_location}->{position} = $position; | |
3117 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
3118 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
3119 } | |
3120 | |
3121 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence | |
3122 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
3123 my $newline = $fhs[$index]->{fh}-> getline(); | |
3124 if ($newline){ | |
3125 chomp $newline; | |
3126 my ($seq_id) = split (/\t/,$newline); | |
3127 $fhs[$index]->{last_seq_id} = $seq_id; | |
3128 $fhs[$index]->{last_line} = $newline; | |
3129 } | |
3130 else{ | |
3131 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3132 $fhs[$index]->{last_seq_id} = undef; | |
3133 $fhs[$index]->{last_line} = undef; | |
3134 last; # break free in case we have reached the end of the alignment output | |
3135 } | |
3136 } | |
3137 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
3138 } | |
3139 } | |
3140 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
3141 | |
3142 my $alignment_location = join (":",$chromosome,$position); | |
3143 # warn "There is no second best hit. Overwrite status: $overwrite\n"; | |
3144 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
3145 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
3146 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
3147 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB | |
3148 | |
3149 if ($overwrite){ | |
3150 $alignments{$alignment_location}->{seq_id} = $id; | |
3151 $alignments{$alignment_location}->{alignment_score} = $alignment_score; | |
3152 $alignments{$alignment_location}->{alignment_score_second_best} = undef; | |
3153 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence; | |
3154 $alignments{$alignment_location}->{index} = $index; | |
3155 $alignments{$alignment_location}->{chromosome} = $chromosome; | |
3156 $alignments{$alignment_location}->{position} = $position; | |
3157 $alignments{$alignment_location}->{MD_tag} = $MD_tag; | |
3158 $alignments{$alignment_location}->{CIGAR} = $cigar; | |
3159 } | |
3160 | |
3161 my $newline = $fhs[$index]->{fh}-> getline(); | |
3162 if ($newline){ | |
3163 chomp $newline; | |
3164 my ($seq_id) = split (/\t/,$newline); | |
3165 $fhs[$index]->{last_seq_id} = $seq_id; | |
3166 $fhs[$index]->{last_line} = $newline; | |
3167 if ($seq_id eq $identifier){ | |
3168 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
3169 } | |
3170 } | |
3171 else{ | |
3172 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3173 $fhs[$index]->{last_seq_id} = undef; | |
3174 $fhs[$index]->{last_line} = undef; | |
3175 } | |
3176 } | |
3177 } | |
3178 } | |
3179 | |
3180 ### If there were several equally good alignments for the best alignment score we will boot the read | |
3181 if ($amb_same_thread){ | |
3182 $alignment_ambiguous = 1; | |
3183 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
3184 } | |
3185 else{ | |
3186 # warn "alignment won't be considered ambiguous. This time....\n"; | |
3187 } | |
3188 | |
3189 ### if the read produced several ambiguous alignments we can return already now. If --ambiguous or --unmapped was specified the read sequence will be printed out. | |
3190 if ($alignment_ambiguous == 1){ | |
3191 $counting{unsuitable_sequence_count}++; | |
3192 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
3193 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
3194 # print "$ambiguous_read_output\n"; | |
3195 | |
3196 if ($ambig_bam){ | |
3197 # warn "Sequence is ambiguous, printing out BAM file:\n"; | |
3198 print AMBIBAM "$first_ambig_alignment\n"; | |
3199 } | |
3200 | |
3201 if ($ambiguous){ | |
3202 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified | |
3203 } | |
3204 elsif ($unmapped){ | |
3205 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
3206 } | |
3207 else{ | |
3208 return 0; | |
3209 } | |
3210 } | |
3211 | |
3212 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file | |
3213 unless(%alignments){ | |
3214 $counting{no_single_alignment_found}++; | |
3215 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value); | |
3216 # print "$unmapped_read_output\n"; | |
3217 if ($unmapped){ | |
3218 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified | |
3219 } | |
3220 else{ | |
3221 return 0; # default | |
3222 } | |
3223 } | |
3224 | |
3225 ####################################################################################################################################################### | |
3226 | |
3227 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
3228 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
3229 ### alignment score we are discarding the sequence altogether. | |
3230 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for | |
3231 ### opening (5) and extending (3 per bp) the gap. | |
3232 | |
3233 ####################################################################################################################################################### | |
3234 | |
3235 my $methylation_call_params; # hash reference which will store all information we need for the methylation call | |
3236 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
3237 | |
3238 ### print contents of %alignments for debugging | |
3239 # if (scalar keys %alignments > 1){ | |
3240 # print "\n******\n"; | |
3241 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
3242 # print "Loc: $alignment_location\n"; | |
3243 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
3244 # print "AS: $alignments{$alignment_location}->{alignment_score}\n"; | |
3245 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n"; | |
3246 # print "Index $alignments{$alignment_location}->{index}\n"; | |
3247 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
3248 # print "pos: $alignments{$alignment_location}->{position}\n"; | |
3249 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n"; | |
3250 # } | |
3251 # print "\n******\n"; | |
3252 # } | |
3253 | |
3254 ### if there is only 1 entry in the hash we accept it as the best alignment | |
3255 if (scalar keys %alignments == 1){ | |
3256 for my $unique_best_alignment (keys %alignments){ | |
3257 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence}; | |
3258 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
3259 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position}; | |
3260 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
3261 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score}; | |
3262 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best}; | |
3263 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag}; | |
3264 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR}; | |
3265 } | |
3266 } | |
3267 | |
3268 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
3269 ### we boot the sequence altogether) | |
3270 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
3271 my $best_alignment_score; | |
3272 my $best_alignment_location; | |
3273 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){ | |
3274 # print "$alignments{$alignment_location}->{alignment_score}\n"; | |
3275 unless (defined $best_alignment_score){ | |
3276 $best_alignment_score = $alignments{$alignment_location}->{alignment_score}; | |
3277 $best_alignment_location = $alignment_location; | |
3278 # print "setting best alignment score: $best_alignment_score\n"; | |
3279 } | |
3280 else{ | |
3281 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted | |
3282 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){ | |
3283 # warn "Same alignment score, the sequence will get booted!\n"; | |
3284 $sequence_fails = 1; | |
3285 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments | |
3286 } | |
3287 ### else we are going to store the best alignment for further processing | |
3288 else{ | |
3289 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence}; | |
3290 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
3291 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position}; | |
3292 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
3293 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score}; | |
3294 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag}; | |
3295 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR}; | |
3296 if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) { | |
3297 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best}; | |
3298 } | |
3299 else { | |
3300 $methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score}; | |
3301 } | |
3302 last; # exiting after processing the second alignment since the sequence produced a unique best alignment | |
3303 } | |
3304 } | |
3305 } | |
3306 } | |
3307 else{ | |
3308 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";; | |
3309 } | |
3310 | |
3311 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions | |
3312 if ($sequence_fails == 1){ | |
3313 $counting{unsuitable_sequence_count}++; | |
3314 | |
3315 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
3316 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value); | |
3317 # print OUT "$ambiguous_read_output\n"; | |
3318 | |
3319 if ($ambiguous){ | |
3320 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified | |
3321 } | |
3322 elsif ($unmapped){ | |
3323 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified | |
3324 } | |
3325 else{ | |
3326 return 0; # => exits to next sequence (default) | |
3327 } | |
3328 } | |
3329 | |
3330 ### --DIRECTIONAL | |
3331 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
3332 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
3333 if ($directional){ | |
3334 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){ | |
3335 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
3336 $counting{alignments_rejected_count}++; | |
3337 return 0; | |
3338 } | |
3339 } | |
3340 | |
3341 ### If the sequence has not been rejected so far it has a unique best alignment | |
3342 $counting{unique_best_alignment_count}++; | |
3343 | |
3344 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well | |
3345 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params); | |
3346 | |
3347 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call | |
3348 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){ | |
3349 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n"; | |
3350 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
3351 return 0; | |
3352 } | |
3353 | |
3354 # Compute MAPQ value | |
3355 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef, | |
3356 $methylation_call_params->{$identifier}->{alignment_score}, | |
3357 $methylation_call_params->{$identifier}->{alignment_score_second_best}); | |
3358 | |
3359 | |
3360 | |
3361 ### otherwise we are set to perform the actual methylation call | |
3362 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion}); | |
3363 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value); | |
3364 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out | |
3365 } | |
3366 | |
3367 | |
### Counts the bases of a read that are affected by an in-silico bisulfite conversion.
###
### Arguments:
###   $sequence        - the read sequence; it is only inspected, never modified
###   $read_conversion - 'CT' (count the Cs, i.e. the targets of a C->T conversion) or
###                      'GA' (count the Gs, i.e. the targets of a G->A conversion)
###
### Returns the number of matching characters (0 if there are none); matching is case-sensitive.
### Dies if $read_conversion is anything other than 'CT' or 'GA' (this includes undef).
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  # tr/// with an empty replacement list only counts the listed characters and leaves the string untouched
  if (defined $read_conversion){
    return ($sequence =~ tr/C//) if ($read_conversion eq 'CT');
    return ($sequence =~ tr/G//) if ($read_conversion eq 'GA');
  }

  # No system call has failed at this point, so $! would only add an unrelated (possibly stale) OS error text.
  # It is therefore deliberately not part of the message.
  die "Read conversion mode of the read was not specified\n";
}
3382 | |
### Decides whether the Bowtie 1 alignment that is currently buffered for $fhs[$index] belongs to the read
### $identifier and lies in a sensible orientation for that index.
###
### Arguments:
###   $index      - position in @fhs of the Bowtie output that is being inspected
###   $identifier - ID of the read that is currently being processed
###
### Returns 1 if a usable alignment of $identifier is held in $fhs[$index]->{last_line} on return, 0 otherwise.
###
### Side effects: up to two further lines may be read from $fhs[$index]->{fh}. last_seq_id and last_line are then
### replaced by the entry that has to be looked at next, or set to undef once the Bowtie output is exhausted.
###
### Dies if the orientation check returns anything other than 0 or 1, or if a third consecutive line carries the
### same read ID.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  # extracting read ID and strand (the first two columns) from the buffered Bowtie 1 line
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the entry stored in @fhs already belongs to another sequence -> it will be analysed in a later round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### Only 4 of the 8 possible index/strand combinations are theoretically sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1); ### 1st possibility for a sequence to pass
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### The buffered alignment was in the wrong orientation, so the next line needs to be read in
  my $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # end of bowtie output; nothing is processed as the buffered alignment was in the wrong orientation
    @{$fhs[$index]}{qw(last_seq_id last_line)} = (undef,undef);
    return 0;
  }

  ($id,$strand) = (split (/\t/,$newline))[0,1];

  ### the line we just read in is already the next sequence to be analysed -> store it and process it next round
  unless ($id eq $identifier){
    @{$fhs[$index]}{qw(last_seq_id last_line)} = ($id,$newline);
    return 0;
  }

  ### a second alignment of the same sequence: checking the orientation again
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    @{$fhs[$index]}{qw(last_seq_id last_line)} = ($id,$newline);
    return 1; ### 2nd possibility for a sequence to pass
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### Wrong orientation again: yet another line is read in; it is only stored in @fhs and not analysed here
  $newline = $fhs[$index]->{fh}->getline();
  unless ($newline){
    # end of bowtie output
    @{$fhs[$index]}{qw(last_seq_id last_line)} = (undef,undef);
    return 0;
  }

  ### the next line must not carry the same seq ID again
  my ($seq_id) = split (/\t/,$newline);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);

  @{$fhs[$index]}{qw(last_seq_id last_line)} = ($seq_id,$newline);
  return 0; # not processing anything this round as both alignments of this sequence were in the wrong orientation
}
3460 ######################### | |
3461 ### BOWTIE 1 | PAIRED-END | |
3462 ######################### | |
3463 | |
### Looks up the Bowtie 1 alignments of one read pair in the four alignment outputs buffered in @fhs, decides whether
### the pair has a unique best alignment (lowest sum of mismatches of both reads) and, if so, performs the methylation
### call and prints the result.
###
### Arguments:
###   $sequence_1, $sequence_2           - the two read sequences of the pair
###   $identifier                        - read ID; it is compared against the buffered last_seq_id of each output
###   $quality_value_1, $quality_value_2 - quality strings (not given for FastA files)
###
### Returns:
###   0 - nothing further is to be done for this pair: it was printed, it was rejected because of --directional, its
###       genomic sequence could not be extracted, or it aligned ambiguously while neither --ambiguous nor
###       --unmapped was specified
###   1 - no alignment was found at all, or the pair aligned ambiguously and --unmapped (but not --ambiguous) was
###       specified; the pair is meant to be written out as unmapped
###   2 - the pair aligned ambiguously and --ambiguous was specified
###
### Side effects: reads from the filehandles in @fhs and updates their buffered entries, increments %counting.
### Dies on malformed Bowtie output (see the helper subroutines below).
sub check_bowtie_results_paired_ends{
  my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;

  ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40.
  ### Definedness and length are tested instead of truth so that a genuine quality string of '0' is kept
  unless (defined $quality_value_1 and length $quality_value_1){
    $quality_value_1 = 'I'x(length$sequence_1);
  }
  unless (defined $quality_value_2 and length $quality_value_2){
    $quality_value_2 = 'I'x(length$sequence_2);
  }

  ### alignment records, stored as $mismatches{sum of mismatches}->{chromosome:position_1:position_2}
  my %mismatches = ();

  ### for paired end reads we are looking at alignments to the OT strand first (index 0), then the OB strand (index 3!!),
  ### similar to the single end way. Alignments to the complementary strands come afterwards (CTOT got index 1, and
  ### CTOB got index 2). This is needed so that alignments which either contain no single C or G or reads which contain
  ### only protected Cs are reported to the original strands (OT and OB) before the complementary strands. It does not
  ### make any difference for the methylation calls, but it matters if alignments to the complementary strands are not
  ### being reported because --directional was specified
  foreach my $index (0,3,1,2){
    ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
    next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
    ### skipping this index if the buffered alignment does not belong to the sequence pair we are looking at
    next unless ($fhs[$index]->{last_seq_id} eq $identifier);

    ### At most two alignments of the same pair are examined per index. Each round processes the entry held in
    ### last_line_1/last_line_2 and then buffers the following two lines, which are either a second alignment of
    ### the same pair or already the next pair
    foreach my $round (1,2){
      ### sequences can fail here if the buffered alignment(s) were in the wrong orientation, or if the buffered
      ### entry is already the next sequence pair. We only continue if 1 was returned
      my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
      last unless ($valid_alignment_found == 1);

      my $alignment = _parse_bowtie1_paired_end_alignment($fhs[$index]->{last_line_1},$fhs[$index]->{last_line_2});
      $alignment->{index} = $index;

      ### To decide whether a sequence pair has a unique best alignment we look at the sum of mismatches of both reads
      my $sum_of_mismatches  = $alignment->{number_of_mismatches_1}+$alignment->{number_of_mismatches_2};
      my $alignment_location = join(":",$alignment->{chromosome},$alignment->{start_seq_1},$alignment->{start_seq_2});

      ### If a sequence aligns to exactly the same location twice it does either not contain any C or G, or all the Cs
      ### (or Gs on the reverse strand) were methylated and therefore protected. The first entry is kept, so the index
      ### of the strand that was looked at first is the one that gets reported
      unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
        $mismatches{$sum_of_mismatches}->{$alignment_location} = $alignment;
      }

      ### jumping to the next index once the end of this bowtie output has been reached
      last unless (_buffer_next_bowtie1_paired_end_alignment($index));
    }
  }

  ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
  unless(%mismatches){
    $counting{no_single_alignment_found}++;
    return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
  }

  ### Only the alignments with the lowest sum of mismatches are of interest. If there is a single best position we
  ### use it for the methylation call; several positions with the same (lowest) number of mismatches mean that the
  ### sequence pair is discarded altogether
  my ($lowest_mismatch_number) = sort {$a<=>$b} keys %mismatches;
  my @best_alignments = values %{$mismatches{$lowest_mismatch_number}};

  unless (scalar @best_alignments == 1){
    $counting{unsuitable_sequence_count}++;
    if ($ambiguous){
      return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
    }
    elsif ($unmapped){
      return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
    }
    else{
      return 0; # => exits to next sequence (default)
    }
  }

  ### All information needed for the methylation call: the stored alignment record, filed under the identifier of
  ### the pair and extended by the end position of the alignment (start of read 2 + length of read 2)
  my $unique_best_alignment = $best_alignments[0];
  my $methylation_call_params; # hash reference!
  $methylation_call_params->{$identifier} = {
                                             %{$unique_best_alignment},
                                             seq_id        => $identifier,
                                             alignment_end => ($unique_best_alignment->{start_seq_2}+length($unique_best_alignment->{bowtie_sequence_2})),
                                            };

  ### --DIRECTIONAL
  ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
  ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
  if ($directional){
    if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
      $counting{alignments_rejected_count}++;
      return 0;
    }
  }

  ### If the sequence has not been rejected so far it does have a unique best alignment
  $counting{unique_best_alignment_count}++;
  extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);

  ### the methylation call is only performed if the extracted genomic sequences have the same length as the observed sequences +2
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }
  if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
    warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
    $counting{genomic_sequence_could_not_be_extracted_count}++;
    return 0;
  }

  ### otherwise we are set to perform the actual methylation call
  $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
  $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});

  print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
  return 0; ## a pair that got this far must not be printed to unmapped_1 and _2
}

### Helper of check_bowtie_results_paired_ends: turns the two Bowtie 1 output lines of a paired-end alignment into an
### alignment record (hash reference with the keys seq_id, bowtie_sequence_1, bowtie_sequence_2, chromosome,
### start_seq_1, start_seq_2, number_of_mismatches_1 and number_of_mismatches_2).
### Dies if a reference name does not end in _CT_converted or _GA_converted, if a mismatch field is malformed, if
### read 1 starts downstream of read 2, or if the two reads were aligned to different chromosomes.
sub _parse_bowtie1_paired_end_alignment{
  my ($line_1,$line_2) = @_;

  # columns used: 1 (read ID), 3 (reference name), 4 (position), 5 (sequence) and 8 (mismatches).
  # The limit of -1 keeps trailing empty fields so that an empty mismatch column is returned as '' and not as undef
  my ($id_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$line_1,-1))[0,2,3,4,7];
  my ($mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2)       = (split (/\t/,$line_2,-1))[2,3,4,7];
  chomp $mismatch_info_1;
  chomp $mismatch_info_2;

  ### need to extract the chromosome name from the bowtie output (which is either XY_CT_converted or XY_GA_converted).
  ### The loop variable is an alias, so the suffix is removed from the two variables themselves
  foreach my $mapped_chromosome ($mapped_chromosome_1,$mapped_chromosome_2){
    unless ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
      die "Chromosome number extraction failed for $mapped_chromosome\n";
    }
  }

  ### Now extracting the number of mismatches to the converted genome
  my $number_of_mismatches_1 = _count_bowtie1_mismatches($mismatch_info_1);
  my $number_of_mismatches_2 = _count_bowtie1_mismatches($mismatch_info_2);

  die "Position 1 is higher than position 2" if ($position_1 > $position_2);
  die "Paired-end alignments need to be on the same chromosome\n" unless ($mapped_chromosome_1 eq $mapped_chromosome_2);

  return {
          seq_id                 => $id_1,                # either is fine
          bowtie_sequence_1      => $bowtie_sequence_1,
          bowtie_sequence_2      => $bowtie_sequence_2,
          chromosome             => $mapped_chromosome_1, # either is fine
          start_seq_1            => $position_1,
          start_seq_2            => $position_2,
          number_of_mismatches_1 => $number_of_mismatches_1,
          number_of_mismatches_2 => $number_of_mismatches_2,
         };
}

### Helper of check_bowtie_results_paired_ends: returns the number of comma-separated entries of a Bowtie 1 mismatch
### field (0 for an empty field). Dies if a non-empty field does not start with a digit.
sub _count_bowtie1_mismatches{
  my ($mismatch_info) = @_;

  return 0 if ($mismatch_info eq '');
  die "Something weird is going on with the mismatch field\n" unless ($mismatch_info =~ /^\d/);

  my @mismatches = split (/,/,$mismatch_info);
  return scalar @mismatches;
}

### Helper of check_bowtie_results_paired_ends: reads the next two lines from $fhs[$index]->{fh} and buffers them as
### last_line_1/last_line_2, together with their read ID (the '/1' tag removed) as last_seq_id.
### Returns 1 if a pair of lines was buffered. At the end of the bowtie output the three buffer entries are set to
### undef and 0 is returned. Dies if neither of the two read IDs ends on '/1', since last_seq_id could otherwise
### not be updated and would keep the ID of the previous pair.
sub _buffer_next_bowtie1_paired_end_alignment{
  my ($index) = @_;

  # both lines are always requested, even if the first one could not be read
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();

  unless ($newline_1 and $newline_2){
    # assigning undef to last_seq_id and both last_lines (end of bowtie output)
    $fhs[$index]->{last_seq_id} = undef;
    $fhs[$index]->{last_line_1} = undef;
    $fhs[$index]->{last_line_2} = undef;
    return 0;
  }

  my ($seq_id_1) = split (/\t/,$newline_1);
  my ($seq_id_2) = split (/\t/,$newline_2);

  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $fhs[$index]->{last_seq_id} = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $fhs[$index]->{last_seq_id} = $seq_id_2;
  }
  else{
    die "Either read 1 or read 2 needs to end on '/1'\n";
  }

  $fhs[$index]->{last_line_1} = $newline_1;
  $fhs[$index]->{last_line_2} = $newline_2;
  return 1;
}
3789 | |
3790 ######################### | |
3791 ### BOWTIE 2 | PAIRED-END | |
3792 ######################### | |
3793 | |
3794 sub check_bowtie_results_paired_ends_bowtie2{ | |
3795 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_; | |
3796 | |
3797 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40 | |
3798 unless ($quality_value_1){ | |
3799 $quality_value_1 = 'I'x(length$sequence_1); | |
3800 } | |
3801 | |
3802 unless ($quality_value_2){ | |
3803 $quality_value_2 = 'I'x(length$sequence_2); | |
3804 } | |
3805 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n"; | |
3806 | |
3807 my %alignments; | |
3808 my $alignment_ambiguous = 0; | |
3809 | |
3810 my $first_ambig_alignment_line1; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified R1 | |
3811 my $first_ambig_alignment_line2; # R2 | |
3812 | |
3813 my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far | |
3814 my $amb_same_thread = 0; ## if a read's primary and secondary alignments have the same alignment score we set this to true. | |
3815 | |
3816 ### reading from the Bowtie 2 output filehandles | |
3817 | |
3818 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way. | |
3819 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2). | |
3820 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB) | |
3821 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary | |
3822 ### strands are not being reported when '--directional' is specified | |
3823 | |
3824 foreach my $index (0,3,1,2){ | |
3825 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output) | |
3826 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id}); | |
3827 | |
3828 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it | |
3829 if ($fhs[$index]->{last_seq_id} eq $identifier) { | |
3830 | |
3831 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10]; | |
3832 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10]; | |
3833 # print "Index: $index\t$fhs[$index]->{last_line_1}\n"; | |
3834 # print "Index: $index\t$fhs[$index]->{last_line_2}\n"; | |
3835 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n"; | |
3836 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n"; | |
3837 $id_1 =~ s/\/1$//; | |
3838 $id_2 =~ s/\/2$//; | |
3839 | |
3840 # SAM format specifications for Bowtie 2 | |
3841 # (1) Name of read that aligned | |
3842 # (2) Sum of all applicable flags. Flags relevant to Bowtie are: | |
3843 # 1 The read is one of a pair | |
3844 # 2 The alignment is one end of a proper paired-end alignment | |
3845 # 4 The read has no reported alignments | |
3846 # 8 The read is one of a pair and has no reported alignments | |
3847 # 16 The alignment is to the reverse reference strand | |
3848 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand | |
3849 # 64 The read is mate 1 in a pair | |
3850 # 128 The read is mate 2 in a pair | |
3851 # 256 The read has multiple mapping states | |
3852 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *) | |
3853 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads) | |
3854 # (5) Mapping quality (255 means MAPQ is not available) | |
3855 # (6) CIGAR string representation of alignment (* if unavailable) | |
3856 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate. | |
3857 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate. | |
3858 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate. | |
3859 # (10) Read sequence (reverse-complemented if aligned to the reverse strand) | |
3860 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file. | |
3861 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment: | |
3862 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read. | |
3863 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read. | |
3864 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment. | |
3865 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read. | |
3866 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read. | |
3867 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
3868 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read. | |
3869 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read. | |
3870 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out. | |
3871 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read. | |
3872 | |
3873 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128). | |
3874 ### We can store the next alignment and move on to the next Bowtie 2 instance | |
3875 if ($flag_1 == 77 and $flag_2 == 141){ | |
3876 ## reading in the next alignment, which must be the next sequence | |
3877 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
3878 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
3879 | |
3880 if ($newline_1 and $newline_2){ | |
3881 chomp $newline_1; | |
3882 chomp $newline_2; | |
3883 my ($seq_id_1) = split (/\t/,$newline_1); | |
3884 my ($seq_id_2) = split (/\t/,$newline_2); | |
3885 $seq_id_1 =~ s/\/1$//; | |
3886 $seq_id_2 =~ s/\/2$//; | |
3887 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
3888 $fhs[$index]->{last_line_1} = $newline_1; | |
3889 $fhs[$index]->{last_line_2} = $newline_2; | |
3890 | |
3891 # print "current sequence ($identifier) did not map, reading in next sequence\n"; | |
3892 # print "$index\t$fhs[$index]->{last_seq_id}\n"; | |
3893 # print "$index\t$fhs[$index]->{last_line_1}\n"; | |
3894 # print "$index\t$fhs[$index]->{last_line_2}\n"; | |
3895 next; # next instance | |
3896 } | |
3897 else{ | |
3898 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
3899 $fhs[$index]->{last_seq_id} = undef; | |
3900 $fhs[$index]->{last_line_1} = undef; | |
3901 $fhs[$index]->{last_line_2} = undef; | |
3902 next; | |
3903 } | |
3904 } | |
3905 | |
3906 ### If there are one or more proper alignments we can extract the chromosome number | |
3907 my ($chromosome_1,$chromosome_2); | |
3908 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){ | |
3909 $chromosome_1 = $mapped_chromosome_1; | |
3910 } | |
3911 else{ | |
3912 die "Chromosome number extraction failed for $mapped_chromosome_1\n"; | |
3913 } | |
3914 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){ | |
3915 $chromosome_2 = $mapped_chromosome_2; | |
3916 } | |
3917 else{ | |
3918 die "Chromosome number extraction failed for $mapped_chromosome_2\n"; | |
3919 } | |
3920 | |
3921 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2); | |
3922 | |
3923 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string | |
3924 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2); | |
3925 | |
3926 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1}); | |
3927 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2}); | |
3928 | |
3929 foreach (11..$#fields_1){ | |
3930 if ($fields_1[$_] =~ /AS:i:(.*)/){ | |
3931 $alignment_score_1 = $1; | |
3932 } | |
3933 elsif ($fields_1[$_] =~ /XS:i:(.*)/){ | |
3934 $second_best_1 = $1; | |
3935 } | |
3936 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){ | |
3937 $MD_tag_1 = $1; | |
3938 } | |
3939 } | |
3940 | |
3941 foreach (11..$#fields_2){ | |
3942 if ($fields_2[$_] =~ /AS:i:(.*)/){ | |
3943 $alignment_score_2 = $1; | |
3944 } | |
3945 elsif ($fields_2[$_] =~ /XS:i:(.*)/){ | |
3946 $second_best_2 = $1; | |
3947 } | |
3948 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){ | |
3949 $MD_tag_2 = $1; | |
3950 } | |
3951 } | |
3952 | |
3953 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1); | |
3954 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2); | |
3955 | |
3956 # warn "First read 1 alignment score is: '$alignment_score_1'\n"; | |
3957 # warn "First read 2 alignment score is: '$alignment_score_2'\n"; | |
3958 # warn "MD tag 1 is: '$MD_tag_1'\n"; | |
3959 # warn "MD tag 2 is: '$MD_tag_2'\n"; | |
3960 | |
3961 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments | |
3962 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ; | |
3963 # warn "sum of alignment scores: $sum_of_alignment_scores_1\n\n"; sleep(1); | |
3964 | |
3965 my $overwrite = 0; # If there are 2 alternative alignments to the same position, e.g. OT with 50 mismatches and CTOB with 0 mismatches, the CTOB one trumps the OT one. | |
3966 # introduced 13 April 2016 as a suggestion by Sylvain Foret, ANU Canberra | |
3967 | |
3968 if (!defined $best_AS_so_far){ | |
3969 $overwrite = 1; | |
3970 $best_AS_so_far = $sum_of_alignment_scores_1; | |
3971 # warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3972 if ($ambig_bam){ # also setting the first_ambig_alignment | |
3973 # Read 1 | |
3974 $first_ambig_alignment_line1 = $fhs[$index]->{last_line_1}; | |
3975 $first_ambig_alignment_line1 =~ s/_(CT|GA)_converted//; | |
3976 # Read 2 | |
3977 $first_ambig_alignment_line2 = $fhs[$index]->{last_line_2}; | |
3978 $first_ambig_alignment_line2 =~ s/_(CT|GA)_converted//; | |
3979 # warn "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; sleep(1); | |
3980 } | |
3981 } | |
3982 else{ | |
3983 if ($sum_of_alignment_scores_1 >= $best_AS_so_far){ # AS are generally negative with a maximum of 0 | |
3984 # 19 07 2016 Changed to >= so that equally good alignments to different positions get added as well. Ambiguous alignments are identified and removed later. | |
3985 $best_AS_so_far = $sum_of_alignment_scores_1; | |
3986 $overwrite = 1; | |
3987 | |
3988 # warn "Found better or equal sum of alignment scores ($sum_of_alignment_scores_1), setting \$best_AS_so_far to $best_AS_so_far\n"; | |
3989 # resetting the ambiguous within thread memory (if applicable at all) only if the current alignment is really better than the previous one. | |
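3990 # 22 07 2016: ambiguous score within same thread only resets if the current alignment is really better than the previous one. NOTE(review): $best_AS_so_far has already been set to $sum_of_alignment_scores_1 a few lines above, so the strict '>' test below can never be true as written; the assignment probably needs to move after this check - TODO confirm | |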
3991 if ($sum_of_alignment_scores_1 > $best_AS_so_far){ | |
3992 # warn "Resetting amb within thread value to 0\n"; | |
3993 $amb_same_thread = 0; | |
3994 | |
3995 if ($ambig_bam){ # also setting a new first_ambig_alignment | |
3996 # Read 1 | |
3997 $first_ambig_alignment_line1 = $fhs[$index]->{last_line_1}; | |
3998 $first_ambig_alignment_line1 =~ s/_(CT|GA)_converted//; | |
3999 # Read 2 | |
4000 $first_ambig_alignment_line2 = $fhs[$index]->{last_line_2}; | |
4001 $first_ambig_alignment_line2 =~ s/_(CT|GA)_converted//; | |
4002 # warn "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; sleep(1); | |
4003 } | |
4004 } | |
4005 } | |
4006 else{ | |
4007 # warn "current alignment (AS $sum_of_alignment_scores) isn't better than the best so far ($best_AS_so_far). Not changing anything\n"; | |
4008 } | |
4009 } | |
4010 | |
4011 if (defined $second_best_1 and defined $second_best_2){ | |
4012 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2; | |
4013 # warn "Second best alignment_score_1 is: '$second_best_1'\n"; | |
4014 # warn "Second best alignment_score_2 is: '$second_best_2'\n"; | |
4015 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n"; | |
4016 | |
4017 # If the sum of alignment scores for the current read pair is the same as the sum of alignment scores of its second best hit we keep a memory of this | |
4018 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){ | |
4019 | |
4020 # checking to see if this read pair produced the best alignment | |
4021 if ($sum_of_alignment_scores_1 == $best_AS_so_far){ # yes this is the best read pair so far, either within the thread or between threads, however it is ambiguous | |
4022 # warn "Read pair is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n"; | |
4023 $amb_same_thread = 1; | |
4024 } | |
4025 else{ | |
4026 # warn "This read pair has a worse alignment score than the best alignment so far and will be ignored even though it is ambiguous in itself\n"; | |
4027 } | |
4028 | |
4029 ### if there is a better alignment later on -> fine. If not, the read will get booted altogether one way or another | |
4030 | |
4031 ## need to read and discard all additional ambiguous reads until we reach the next sequence | |
4032 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
4033 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
4034 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
4035 if ($newline_1 and $newline_2){ | |
4036 chomp $newline_1; | |
4037 chomp $newline_2; | |
4038 my ($seq_id_1) = split (/\t/,$newline_1); | |
4039 my ($seq_id_2) = split (/\t/,$newline_2); | |
4040 $seq_id_1 =~ s/\/1$//; | |
4041 $seq_id_2 =~ s/\/2$//; | |
4042 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
4043 | |
4044 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
4045 $fhs[$index]->{last_line_1} = $newline_1; | |
4046 $fhs[$index]->{last_line_2} = $newline_2; | |
4047 } | |
4048 else{ | |
4049 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output) | |
4050 $fhs[$index]->{last_seq_id} = undef; | |
4051 $fhs[$index]->{last_line_1} = undef; | |
4052 $fhs[$index]->{last_line_2} = undef; | |
4053 last; # break free if the end of the alignment output was reached | |
4054 } | |
4055 } | |
4056 # if ($fhs[$index]->{last_seq_id}){ | |
4057 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n"; | |
4058 # } | |
4059 } | |
4060 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment | |
4061 | |
4062 my $alignment_location; | |
4063 if ($position_1 <= $position_2){ | |
4064 $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
4065 } | |
4066 elsif($position_2 < $position_1){ | |
4067 $alignment_location = join(":",$chromosome_1,$position_2,$position_1); | |
4068 } | |
4069 | |
4070 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
4071 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
4072 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
4073 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
4074 | |
4075 if ($overwrite){ # see comment above at "my $overwrite = ..." | |
4076 #unless (exists $alignments{$alignment_location}){ | |
4077 $alignments{$alignment_location}->{seq_id} = $id_1; | |
4078 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
4079 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
4080 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
4081 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = $sum_of_alignment_scores_second_best; | |
4082 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
4083 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
4084 $alignments{$alignment_location}->{index} = $index; | |
4085 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
4086 $alignments{$alignment_location}->{position_1} = $position_1; | |
4087 $alignments{$alignment_location}->{position_2} = $position_2; | |
4088 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
4089 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
4090 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
4091 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
4092 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
4093 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
4094 # warn "added best of several alignments to \%alignments hash\n"; | |
4095 } | |
4096 | |
4097 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence | |
4098 until ($fhs[$index]->{last_seq_id} ne $identifier){ | |
4099 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
4100 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
4101 if ($newline_1 and $newline_2){ | |
4102 chomp $newline_1; | |
4103 chomp $newline_2; | |
4104 my ($seq_id_1) = split (/\t/,$newline_1); | |
4105 my ($seq_id_2) = split (/\t/,$newline_2); | |
4106 $seq_id_1 =~ s/\/1$//; | |
4107 $seq_id_2 =~ s/\/2$//; | |
4108 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
4109 | |
4110 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
4111 $fhs[$index]->{last_line_1} = $newline_1; | |
4112 $fhs[$index]->{last_line_2} = $newline_2; | |
4113 } | |
4114 else{ | |
4115 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
4116 $fhs[$index]->{last_seq_id} = undef; | |
4117 $fhs[$index]->{last_line_1} = undef; | |
4118 $fhs[$index]->{last_line_2} = undef; | |
4119 last; # break free if the end of the alignment output was reached | |
4120 } | |
4121 } | |
4122 # if($fhs[$index]->{last_seq_id}){ | |
4123 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n"; | |
4124 # } | |
4125 } | |
4126 } | |
4127 else{ # there is no second best hit, so we can just store this one and read in the next sequence | |
4128 | |
4129 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2); | |
4130 # print "$alignment_location\n"; | |
4131 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse | |
4132 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite | |
4133 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only | |
4134 ### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB | |
4135 | |
4136 #unless (exists $alignments{$alignment_location}){ # see comment above at my $overwrite = ... | |
4137 if ($overwrite){ | |
4138 $alignments{$alignment_location}->{seq_id} = $id_1; | |
4139 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1; | |
4140 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2; | |
4141 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1; | |
4142 $alignments{$alignment_location}->{sum_of_alignment_scores_second_best} = undef; | |
4143 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1; | |
4144 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2; | |
4145 $alignments{$alignment_location}->{index} = $index; | |
4146 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine | |
4147 $alignments{$alignment_location}->{position_1} = $position_1; | |
4148 $alignments{$alignment_location}->{position_2} = $position_2; | |
4149 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1; | |
4150 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2; | |
4151 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1; | |
4152 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2; | |
4153 $alignments{$alignment_location}->{flag_1} = $flag_1; | |
4154 $alignments{$alignment_location}->{flag_2} = $flag_2; | |
4155 # warn "added unique alignment to \%alignments hash\n"; | |
4156 } | |
4157 | |
4158 # Now reading and storing the next read pair | |
4159 my $newline_1 = $fhs[$index]->{fh}-> getline(); | |
4160 my $newline_2 = $fhs[$index]->{fh}-> getline(); | |
4161 if ($newline_1 and $newline_2){ | |
4162 chomp $newline_1; | |
4163 chomp $newline_2; | |
4164 # print "$newline_1\n"; | |
4165 # print "$newline_2\n"; | |
4166 my ($seq_id_1) = split (/\t/,$newline_1); | |
4167 my ($seq_id_2) = split (/\t/,$newline_2); | |
4168 $seq_id_1 =~ s/\/1$//; | |
4169 $seq_id_2 =~ s/\/2$//; | |
4170 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n"; | |
4171 | |
4172 $fhs[$index]->{last_seq_id} = $seq_id_1; | |
4173 $fhs[$index]->{last_line_1} = $newline_1; | |
4174 $fhs[$index]->{last_line_2} = $newline_2; | |
4175 | |
4176 if ($seq_id_1 eq $identifier){ | |
4177 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n"; | |
4178 } | |
4179 } | |
4180 else{ | |
4181 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output) | |
4182 $fhs[$index]->{last_seq_id} = undef; | |
4183 $fhs[$index]->{last_line_1} = undef; | |
4184 $fhs[$index]->{last_line_2} = undef; | |
4185 } | |
4186 } | |
4187 } | |
4188 } | |
4189 | |
4190 ### If there were several equally good alignments for the best alignment score we will boot the read | |
4191 if ($amb_same_thread){ | |
4192 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
4193 $alignment_ambiguous = 1; | |
4194 # warn "\$alignment_ambiguous now: $alignment_ambiguous\n"; | |
4195 } | |
4196 else{ | |
4197 # warn "alignment won't be considered ambiguous. This time....\n"; | |
4198 } | |
4199 | |
4200 | |
4201 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format | |
4202 if ($alignment_ambiguous == 1){ | |
4203 $counting{unsuitable_sequence_count}++; | |
4204 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
4205 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
4206 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
4207 # print "$ambiguous_read_1\n"; | |
4208 # print "$ambiguous_read_2\n"; | |
4209 | |
4210 if ($ambig_bam){ | |
4211 # warn "Sequence is ambiguous, printing out to ambiguous BAM file:\n"; | |
4212 # replacing the first /1\t in the ID of R1 | |
4213 # warn "Was\n$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; | |
4214 $first_ambig_alignment_line1 =~ s/\/1\t/\t/; | |
4215 $first_ambig_alignment_line2 =~ s/\/2\t/\t/; | |
4216 # warn "Is:\n$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n\n"; | |
4217 | |
4218 print AMBIBAM "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; | |
4219 # print "$first_ambig_alignment_line1\n$first_ambig_alignment_line2\n"; | |
4220 } | |
4221 | |
4222 if ($ambiguous){ | |
4223 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
4224 } | |
4225 elsif ($unmapped){ | |
4226 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
4227 } | |
4228 else{ | |
4229 return 0; | |
4230 } | |
4231 } | |
4232 | |
4233 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file | |
4234 unless (%alignments){ | |
4235 $counting{no_single_alignment_found}++; | |
4236 | |
4237 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
4238 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
4239 # print "$unmapped_read_1\n"; | |
4240 # print "$unmapped_read_2\n"; | |
4241 if ($unmapped){ | |
4242 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified | |
4243 } | |
4244 else{ | |
4245 return 0; | |
4246 } | |
4247 } | |
4248 | |
4249 ####################################################################################################################################################### | |
4250 | |
4251 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one | |
4252 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest) | |
4253 ### alignment score we are discarding the sequence pair altogether. | |
4254 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5) | |
4255 ### and extending (3 per bp) the gap. | |
4256 | |
4257 ####################################################################################################################################################### | |
4258 | |
4259 ### Declaring an empty hash reference which will store all information we need for the methylation call | |
4260 my $methylation_call_params; # hash reference | |
4261 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then) | |
4262 | |
4263 ### print contents of %alignments for debugging | |
4264 ## if (scalar keys %alignments >= 1){ | |
4265 # print "\n******\n"; | |
4266 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){ | |
4267 # print "Loc: $alignment_location\n"; | |
4268 # print "ID: $alignments{$alignment_location}->{seq_id}\n"; | |
4269 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n"; | |
4270 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n"; | |
4271 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n"; | |
4272 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n"; | |
4273 # print "Index $alignments{$alignment_location}->{index}\n"; | |
4274 # print "Chr: $alignments{$alignment_location}->{chromosome}\n"; | |
4275 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n"; | |
4276 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n"; | |
4277 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n"; | |
4278 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n"; | |
4279 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n"; | |
4280 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n"; | |
4281 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n"; | |
4282 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n"; | |
4283 # } | |
4284 # print "\n******\n"; | |
4285 # } | |
4286 | |
4287 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment | |
4288 if (scalar keys %alignments == 1){ | |
4289 for my $unique_best_alignment (keys %alignments){ | |
4290 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1}; | |
4291 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2}; | |
4292 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome}; | |
4293 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1}; | |
4294 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2}; | |
4295 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index}; | |
4296 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1}; | |
4297 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2}; | |
4298 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores}; | |
4299 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores_second_best}; | |
4300 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1}; | |
4301 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2}; | |
4302 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1}; | |
4303 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2}; | |
4304 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1}; | |
4305 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2}; | |
4306 } | |
4307 } | |
4308 | |
4309 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case | |
4310 ### we boot the sequence pair altogether) | |
4311 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){ | |
4312 my $best_sum_of_alignment_scores; | |
4313 my $best_alignment_location; | |
4314 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){ | |
4315 | |
4316 # warn "$alignments{$alignment_location}->{sum_of_alignment_scores}\n"; sleep(1); | |
4317 | |
4318 unless (defined $best_sum_of_alignment_scores){ | |
4319 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores}; | |
4320 $best_alignment_location = $alignment_location; | |
4321 # print "setting best alignment score to: $best_sum_of_alignment_scores\n"; | |
4322 } | |
4323 else{ | |
4324 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted | |
4325 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){ | |
4326 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n"; | |
4327 $sequence_pair_fails = 1; | |
4328 last; # exiting since we know that the sequence has ambiguous alignments | |
4329 } | |
4330 ### else we are going to store the best alignment for further processing | |
4331 else{ | |
4332 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1}; | |
4333 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2}; | |
4334 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome}; | |
4335 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1}; | |
4336 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2}; | |
4337 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index}; | |
4338 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1}; | |
4339 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2}; | |
4340 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores}; | |
4341 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1}; | |
4342 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2}; | |
4343 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1}; | |
4344 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2}; | |
4345 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1}; | |
4346 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2}; | |
4347 | |
4348 if (defined $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} and ( $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best} > $alignments{$alignment_location}->{sum_of_alignment_scores} )) { | |
4349 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$best_alignment_location}->{sum_of_alignment_scores_second_best}; | |
4350 } | |
4351 else { | |
4352 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best} = $alignments{$alignment_location}->{sum_of_alignment_scores}; | |
4353 } | |
4354 | |
4355 last; # exiting since the sequence produced a unique best alignment | |
4356 } | |
4357 } | |
4358 } | |
4359 } | |
4360 else{ | |
4361 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";; | |
4362 } | |
4363 | |
4364 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions | |
4365 if ($sequence_pair_fails == 1){ | |
4366 $counting{unsuitable_sequence_count}++; | |
4367 | |
4368 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else | |
4369 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1); | |
4370 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2); | |
4371 # warn "$ambiguous_read_1\n"; | |
4372 # warn "$ambiguous_read_2\n"; | |
4373 | |
4374 if ($ambiguous){ | |
4375 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified | |
4376 } | |
4377 elsif ($unmapped){ | |
4378 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified | |
4379 } | |
4380 else{ | |
4381 return 0; # => exits to next sequence pair (default) | |
4382 } | |
4383 } | |
4384 | |
4385 ### --DIRECTIONAL | |
4386 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore | |
4387 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol | |
4388 if ($directional){ | |
4389 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){ | |
4390 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n"; | |
4391 $counting{alignments_rejected_count}++; | |
4392 return 0; | |
4393 } | |
4394 } | |
4395 | |
4396 ### If the sequence pair has not been rejected so far it does have a unique best alignment | |
4397 $counting{unique_best_alignment_count}++; | |
4398 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params); | |
4399 | |
4400 ### check to see if the genomic sequences we extracted has the same length as the observed sequences +2, and only then we perform the methylation call | |
4401 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){ | |
4402 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_1}\n"; | |
4403 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
4404 return 0; | |
4405 } | |
4406 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){ | |
4407 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position_2}\n"; | |
4408 $counting{genomic_sequence_could_not_be_extracted_count}++; | |
4409 return 0; | |
4410 } | |
4411 | |
4412 ### Compute MAPQ value | |
4413 $methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence_1), length($sequence_2), | |
4414 $methylation_call_params->{$identifier}->{sum_of_alignment_scores}, | |
4415 $methylation_call_params->{$identifier}->{sum_of_alignment_scores_second_best}); | |
4416 | |
4417 | |
4418 ### now we are set to perform the actual methylation call | |
4419 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1}); | |
4420 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2}); | |
4421 # warn "$methylation_call_params->{$identifier}->{read_conversion_2}\n"; | |
4422 # warn " $sequence_2\n"; | |
4423 # warn "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n"; | |
4424 # warn " $methylation_call_params->{$identifier}->{methylation_call_2}\n"; | |
4425 | |
4426 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); | |
4427 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2 | |
4428 } | |
4429 | |
4430 ### | |
4431 | |
4432 # Compute MAPQ value for a read or read pair as in Bowtie2-2.2.2 (specifically, V2 of the MAPQ calculator: "class BowtieMapq2") | |
4433 # assuming end-to-end alignment with the default calculation of the minimum alignment score | |
4434 | |
sub calc_mapq {
  ### Returns the MAPQ value for a read (single-end) or read pair (paired-end), following V2 of the Bowtie 2 MAPQ calculator
  ### and assuming end-to-end alignment with the default calculation of the minimum alignment score.
  ###
  ### Arguments:
  ###   $read1Len   - length of read 1
  ###   $read2Len   - length of read 2 (left undefined in single-end mode)
  ###   $AS_best    - alignment score (or sum of alignment scores for a pair) of the best alignment
  ###   $AS_secBest - the same for the second best alignment (undefined if there was no second best alignment)
  ###
  ### Uses the file-level --score_min settings $score_min_intercept and $score_min_slope.
  my ($read1Len, $read2Len, $AS_best, $AS_secBest) = @_;

  ### lowest alignment score that still counts as valid
  my $scMin = $score_min_intercept + $score_min_slope * $read1Len;
  ### read2Len is only defined for paired-end reads, so for single-end mode we can just use the score min value for read 1
  $scMin += $score_min_intercept + $score_min_slope * $read2Len if (defined $read2Len);

  my $diff     = abs($scMin);        # scores can vary by up to this much (since max AS is 0 for end-to-end alignment)
  my $bestOver = $AS_best - $scMin;  # how far the best alignment lies above the minimum score

  ### No second best alignment: the MAPQ only depends on how far the best score lies above the minimum
  unless (defined $AS_secBest){
    return 42 if ($bestOver >= $diff * 0.8);
    return 40 if ($bestOver >= $diff * 0.7);
    return 24 if ($bestOver >= $diff * 0.6);
    return 23 if ($bestOver >= $diff * 0.5);
    return 8  if ($bestOver >= $diff * 0.4);
    return 3  if ($bestOver >= $diff * 0.3);
    return 0;
  }

  ### A second best alignment exists: the MAPQ is primarily determined by the gap between the best and the second best
  ### score ($bestDiff), and within each gap tier by the quality of the best alignment itself
  my $bestDiff = abs(abs($AS_best) - abs($AS_secBest));
  my $perfect  = ($bestOver == $diff); # true if the best alignment has the highest possible score

  if ($bestDiff >= $diff * 0.9){ return $perfect ? 39 : 33; }
  if ($bestDiff >= $diff * 0.8){ return $perfect ? 38 : 27; }
  if ($bestDiff >= $diff * 0.7){ return $perfect ? 37 : 26; }
  if ($bestDiff >= $diff * 0.6){ return $perfect ? 36 : 22; }

  if ($bestDiff >= $diff * 0.5){
    return 35 if ($perfect);
    return 25 if ($bestOver >= $diff * 0.84);
    return 16 if ($bestOver >= $diff * 0.68);
    return 5;
  }
  if ($bestDiff >= $diff * 0.4){
    return 34 if ($perfect);
    return 21 if ($bestOver >= $diff * 0.84);
    return 14 if ($bestOver >= $diff * 0.68);
    return 4;
  }
  if ($bestDiff >= $diff * 0.3){
    return 32 if ($perfect);
    return 18 if ($bestOver >= $diff * 0.88);
    return 15 if ($bestOver >= $diff * 0.67);
    return 3;
  }
  if ($bestDiff >= $diff * 0.2){
    return 31 if ($perfect);
    return 17 if ($bestOver >= $diff * 0.88);
    return 11 if ($bestOver >= $diff * 0.67);
    return 0;
  }
  if ($bestDiff >= $diff * 0.1){
    return 30 if ($perfect);
    return 12 if ($bestOver >= $diff * 0.88);
    return 7  if ($bestOver >= $diff * 0.67);
    return 0;
  }

  ### best and second best alignment are (nearly) indistinguishable
  if ($bestDiff > 0){
    return ($bestOver >= $diff * 0.67) ? 6 : 2;
  }
  return ($bestOver >= $diff * 0.67) ? 1 : 0;
}
4546 | |
4547 | |
4548 ### | |
4549 | |
sub decide_whether_paired_end_alignment_is_valid{
  ### Decides whether the paired-end alignment currently buffered for bowtie filehandle $index (in
  ### $fhs[$index]->{last_line_1} and ->{last_line_2}) belongs to the sequence pair $identifier and is in a sensible
  ### orientation.
  ###
  ### Returns 1 if $fhs[$index]->{last_line_1/2} hold a valid alignment for $identifier that should be processed now.
  ### Returns 0 if there is nothing to process for $identifier from this filehandle in this round.
  ###
  ### Side effects: up to 4 further lines (2 alignments) may be read from $fhs[$index]->{fh}. Whenever lines were read,
  ### last_seq_id/last_line_1/last_line_2 are updated to the entry that needs to be looked at next, or set to undef once
  ### the end of the bowtie output has been reached.
  ### Dies if an alignment orientation is neither 0 nor 1, if none of the two read IDs of a newly read pair ends
  ### on /1, or if the same sequence ID occurs 3 times in a row.
  my ($index,$identifier) = @_;

  ### only the read ID and the strand (the first two tab-separated fields) are needed to make this decision
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//; # removing the read /1 tag
  $seq_id_2 =~ s/\/1$//; # read 1 may be reported as either the first or the second line

  ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be analysed -> analyse next round
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    return 0;
  }

  ### checking the orientation of the alignment. We need to discriminate between 8 different conditions, however only 4 of them are theoretically
  ### sensible alignments
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### (helpers are only set up from here on so that the two common exits above stay as cheap as possible)

  ### stores a pair of lines and their sequence ID as the entry to be looked at next; calling it without arguments
  ### assigns undef to all three fields (end of bowtie output)
  my $store_next_entry = sub {
    my ($seq_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
  };

  ### removes the /1 tag from whichever of $seq_id_1 or $seq_id_2 carries it (in place) and returns that sequence ID
  my $capture_read_1_id = sub {
    return $seq_id_1 if ($seq_id_1 =~ s/\/1$//);
    return $seq_id_2 if ($seq_id_2 =~ s/\/1$//);
    die "One of the two reads needs to end on /1!!";
  };

  ### The buffered alignment was in the wrong orientation, so we need to read in two new lines
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $store_next_entry->(); # end of bowtie output
    return 0;              # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
  }

  ### extract detailed information about the alignment again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;
  my $seqid = $capture_read_1_id->(); # we need to capture the first read (ending on /1)

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  unless ($seq_id_1 eq $identifier or $seq_id_2 eq $identifier){
    $store_next_entry->($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### the entry we just read is a second alignment for the same sequence pair: checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    ### Writing the current sequence to last_line_1 and last_line_2
    $store_next_entry->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  unless ($orientation == 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation again, so we need to read in yet another 2 new lines and store them in @fhs
  ### (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  unless ($newline_1 and $newline_2){
    $store_next_entry->(); # end of bowtie output
    return 0;              # not processing anything as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);
  $seqid = $capture_read_1_id->();

  ### check if the next 2 lines still have the same seq ID (must not happen), and if not overwrite the current seq-ID and bowtie output with
  ### the same fields of the just read next entry
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $store_next_entry->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as the alignment currently stored in last_line_1 and _2 was in the wrong orientation
}
4670 | |
4671 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END | |
4672 | |
sub extract_corresponding_genomic_sequence_paired_ends {
  ### Extracts the genomic sequence underneath both reads of a paired-end alignment and stores it, together with the
  ### strand and conversion information needed for the methylation call, in $methylation_call_params->{$sequence_identifier}.
  ###
  ### Reads from $methylation_call_params->{$sequence_identifier}:
  ###   index, chromosome, start_seq_1, start_seq_2, bowtie_sequence_1, bowtie_sequence_2
  ### Writes to $methylation_call_params->{$sequence_identifier}:
  ###   alignment_read_1/2, read_conversion_1/2, genome_conversion, unmodified_genomic_sequence_1/2
  ### Also increments the strand-specific counter in %counting. Dies if index is not 0, 1, 2 or 3.
  ###
  ### Each genomic sequence is extracted with 2 extra bases at one of its ends so that we can also make a CpG, CHG or CHH
  ### methylation call if the C happens to be at the first or last position of the actually observed sequence. Where a
  ### bounds check is made and the extra bases would fall outside of the chromosome, an empty string is stored instead.
  ### NOTE(review): presumably the caller rejects genomic sequences that are not read length + 2 long, as the Bowtie 2
  ### code path does - confirm.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### shortcuts for the values that are needed over and over again below
  my $params  = ($methylation_call_params->{$sequence_identifier} ||= {});
  my $index   = $params->{index};
  my $chr     = $params->{chromosome};
  my $start_1 = $params->{start_seq_1};
  my $start_2 = $params->{start_seq_2};
  my $span_1  = length($params->{bowtie_sequence_1})+2; # aligned length plus the 2 extra bases
  my $span_2  = length($params->{bowtie_sequence_2})+2;

  ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
  ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
  my ($alignment_read_1,$alignment_read_2);
  my ($read_conversion_info_1,$read_conversion_info_2);
  my $genome_conversion;
  my ($non_bisulfite_sequence_1,$non_bisulfite_sequence_2);

  ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
  ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
  ### sequences around!

  ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
  if ($index == 0){
    ### [Index 0, sequence originated from (converted) forward strand]
    $counting{CT_GA_CT_count}++;
    ($alignment_read_1,$alignment_read_2) = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'CT';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
    $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1,$span_1);

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### the 2 extra bases are captured at the 3' end and end up at the 5' end after reverse complementation
    if (length($chromosomes{$chr}) >= $start_2+$span_2){
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chr},$start_2,$span_2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
  elsif ($index == 1){
    ### [Index 1, sequence originated from complementary to (converted) reverse strand]
    $counting{GA_CT_GA_count}++;
    ($alignment_read_1,$alignment_read_2) = ('+','-');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'GA';

    ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
    ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
    if ($start_1 >= 2){
      $non_bisulfite_sequence_1 = substr ($chromosomes{$chr},$start_1-2,$span_1);
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
    ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation.
    ### A start position below 2 would turn into a negative substr offset, which Perl counts from the END of the chromosome, so we
    ### store an empty sequence instead of unrelated bases in that case
    if ($start_2 >= 2){
      $non_bisulfite_sequence_2 = reverse_complement(substr ($chromosomes{$chr},$start_2-2,$span_2));
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 2){
    ### [Index 2, sequence originated from the complementary to (converted) forward strand]
    $counting{GA_CT_CT_count}++;
    ($alignment_read_1,$alignment_read_2) = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('GA','CT');
    $genome_conversion = 'CT';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### Read 1 is in - orientation on the reverse strand (second bowtie line). As read 1 is GA converted we need to capture 2 extra 3' bases
    ### which will be 2 extra 5' bases after reverse complementation
    $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chr},$start_2,$span_2));

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is the forward hit (first bowtie line) and CT converted so we need to capture 2 extra 3' bases
    if (length($chromosomes{$chr}) >= $start_1+$span_1){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1,$span_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }

  ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
  elsif ($index == 3){
    ### [Index 3, sequence originated from the (converted) reverse strand]
    $counting{CT_GA_GA_count}++;
    ($alignment_read_1,$alignment_read_2) = ('-','+');
    ($read_conversion_info_1,$read_conversion_info_2) = ('CT','GA');
    $genome_conversion = 'GA';

    ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
    ### Read 1 is in - orientation on the reverse strand (second bowtie line). As read 1 is CT converted we need to capture 2 extra 5' bases
    ### which will be 2 extra 3' bases after reverse complementation
    if ($start_2 >= 2){
      $non_bisulfite_sequence_1 = reverse_complement(substr ($chromosomes{$chr},$start_2-2,$span_2));
    }
    else{
      $non_bisulfite_sequence_1 = '';
    }

    ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
    ### Read 2 is the forward hit (first bowtie line) and GA converted so we need to capture 2 extra 5' bases. Again a start position
    ### below 2 must not be allowed to wrap around to the end of the chromosome via a negative substr offset
    if ($start_1 >= 2){
      $non_bisulfite_sequence_2 = substr ($chromosomes{$chr},$start_1-2,$span_1);
    }
    else{
      $non_bisulfite_sequence_2 = '';
    }
  }
  else{
    die "Too many bowtie result filehandles\n";
  }

  ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
  ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
  $params->{alignment_read_1}              = $alignment_read_1;
  $params->{alignment_read_2}              = $alignment_read_2;
  $params->{genome_conversion}             = $genome_conversion;
  $params->{read_conversion_1}             = $read_conversion_info_1;
  $params->{read_conversion_2}             = $read_conversion_info_2;
  $params->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
  $params->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
}
4813 | |
4814 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END | |
4815 | |
4816 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{ | |
4817 my ($sequence_identifier,$methylation_call_params) = @_; | |
4818 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the | |
4819 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call | |
4820 | |
4821 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1}; | |
4822 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2}; | |
4823 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1}; | |
4824 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2}; | |
4825 | |
4826 my $contains_deletion_1 = 0; | |
4827 my $contains_deletion_2 = 0; | |
4828 if ($cigar_1 =~ /D/){ | |
4829 $contains_deletion_1 = 1; | |
4830 if ($verbose){ warn "$cigar_1\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_1}\n";} | |
4831 } | |
4832 if ($cigar_2 =~ /D/){ | |
4833 $contains_deletion_2 = 1; | |
4834 if ($verbose){ warn "$cigar_2\n$methylation_call_params->{$sequence_identifier}->{mismatch_info_2}\n";} | |
4835 } | |
4836 | |
4837 # warn "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n"; | |
4838 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and | |
4839 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence | |
4840 | |
4841 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, | |
4842 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions | |
4843 my $alignment_read_1; | |
4844 my $alignment_read_2; | |
4845 my $read_conversion_info_1; | |
4846 my $read_conversion_info_2; | |
4847 my $genome_conversion; | |
4848 | |
4849 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call | |
4850 ### if the C happens to be at the last position of the actually observed sequence | |
4851 my $non_bisulfite_sequence_1 = ''; | |
4852 my $non_bisulfite_sequence_2 = ''; | |
4853 my $genomic_seq_for_MD_tag_1 = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output | |
4854 my $genomic_seq_for_MD_tag_2 = ''; | |
4855 | |
4856 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings | |
4857 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1; | |
4858 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1; | |
4859 | |
4860 # parsing CIGAR 1 string | |
4861 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation | |
4862 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation | |
4863 shift @ops_1; # remove the empty first element | |
4864 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1); | |
4865 # parsing CIGAR 2 string | |
4866 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation | |
4867 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation | |
4868 shift @ops_2; # remove the empty first element | |
4869 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2); | |
4870 | |
4871 my $indels_1 = 0; # adding these to the hemming distance value (needed for the NM field in the final SAM output | |
4872 my $indels_2 = 0; | |
4873 | |
4874 ### Extracting read 1 genomic sequence ### | |
4875 | |
4876 # extracting 2 additional bp at the 5' end (read 1) | |
4877 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ | |
4878 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
4879 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise | |
4880 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
4881 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; | |
4882 return; | |
4883 } | |
4884 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2); | |
4885 } | |
4886 | |
4887 foreach (0..$#len_1){ | |
4888 if ($ops_1[$_] eq 'M'){ | |
4889 # extracting genomic sequence | |
4890 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); | |
4891 if ($contains_deletion_1){ | |
4892 $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); | |
4893 } | |
4894 # warn "$non_bisulfite_sequence_1\n"; | |
4895 # adjusting position | |
4896 $pos_1 += $len_1[$_]; | |
4897 } | |
4898 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence | |
4899 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls, and we can later ignore it for the generation of the MD;Z: tag | |
4900 $non_bisulfite_sequence_1 .= 'X' x $len_1[$_]; | |
4901 if ($contains_deletion_1){ | |
4902 $genomic_seq_for_MD_tag_1 .= 'X' x $len_1[$_]; | |
4903 } | |
4904 # warn "$non_bisulfite_sequence_1\n"; | |
4905 # position doesn't need adjusting | |
4906 | |
4907 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() | |
4908 # indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output | |
4909 } | |
4910 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence | |
4911 # we do not add any genomic sequence but only adjust the position | |
4912 # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later | |
4913 if ($contains_deletion_1){ | |
4914 $genomic_seq_for_MD_tag_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]); | |
4915 } | |
4916 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n"; | |
4917 $pos_1 += $len_1[$_]; | |
4918 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output | |
4919 } | |
4920 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die | |
4921 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
4922 } | |
4923 else{ | |
4924 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
4925 } | |
4926 } | |
4927 | |
4928 ### 3' end of read 1 | |
4929 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ | |
4930 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
4931 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise | |
4932 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
4933 return; | |
4934 } | |
4935 | |
4936 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2); | |
4937 } | |
4938 | |
4939 | |
4940 ### Extracting read 2 genomic sequence ### | |
4941 | |
4942 ### 5' end of read 2 | |
4943 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){ | |
4944 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
4945 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise | |
4946 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
4947 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; | |
4948 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; | |
4949 return; | |
4950 } | |
4951 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2); | |
4952 } | |
4953 | |
4954 foreach (0..$#len_2){ | |
4955 if ($ops_2[$_] eq 'M'){ | |
4956 # extracting genomic sequence | |
4957 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); | |
4958 if ($contains_deletion_2){ | |
4959 $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); | |
4960 } | |
4961 # warn "$non_bisulfite_sequence_2\n"; | |
4962 # adjusting position | |
4963 $pos_2 += $len_2[$_]; | |
4964 } | |
4965 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence | |
4966 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can ignore this later during the generation of the MD:Z: tag | |
4967 $non_bisulfite_sequence_2 .= 'X' x $len_2[$_]; | |
4968 if ($contains_deletion_2){ | |
4969 $genomic_seq_for_MD_tag_2 .= 'X' x $len_2[$_]; | |
4970 } | |
4971 # warn "$non_bisulfite_sequence_2\n"; | |
4972 # position doesn't need adjusting | |
4973 | |
4974 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail a base by base comparison in hemming_dist() | |
4975 # $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output | |
4976 } | |
4977 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence | |
4978 # we do not add any genomic sequence but only adjust the position | |
4979 # we do however need to add the genomic sequence to $genomic_seq_for_MD-tag so we can create a proper MD tag later | |
4980 if ($contains_deletion_2){ | |
4981 $genomic_seq_for_MD_tag_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]); | |
4982 } | |
4983 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n"; | |
4984 $pos_2 += $len_2[$_]; | |
4985 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output | |
4986 } | |
4987 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die | |
4988 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
4989 } | |
4990 else{ | |
4991 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
4992 } | |
4993 } | |
4994 | |
4995 ### 3' end of read 2 | |
4996 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){ | |
4997 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
4998 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise | |
4999 # need to set read 1 as well now to prevent warning | |
5000 # warn "'$non_bisulfite_sequence_1'\n'$non_bisulfite_sequence_2'\n\n"; | |
5001 # sleep(5); | |
5002 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
5003 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; | |
5004 return; | |
5005 } | |
5006 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2); | |
5007 } | |
5008 | |
5009 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was | |
5010 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly | |
5011 | |
5012 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only) | |
5013 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){ | |
5014 ### [Index 0, sequence originated from (converted) forward strand] | |
5015 $counting{CT_GA_CT_count}++; | |
5016 $alignment_read_1 = '+'; | |
5017 $alignment_read_2 = '-'; | |
5018 $read_conversion_info_1 = 'CT'; | |
5019 $read_conversion_info_2 = 'GA'; | |
5020 $genome_conversion = 'CT'; | |
5021 ### Read 1 is always the forward hit | |
5022 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented | |
5023 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); | |
5024 if ($contains_deletion_2){ | |
5025 $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); | |
5026 } | |
5027 } | |
5028 | |
5029 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only) | |
5030 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){ | |
5031 ### [Index 1, sequence originated from complementary to (converted) bottom strand] | |
5032 $counting{GA_CT_GA_count}++; | |
5033 $alignment_read_1 = '+'; | |
5034 $alignment_read_2 = '-'; | |
5035 $read_conversion_info_1 = 'GA'; | |
5036 $read_conversion_info_2 = 'CT'; | |
5037 $genome_conversion = 'GA'; | |
5038 ### Read 1 is always the forward hit | |
5039 ### Read 2 is will always on the reverse strand, so it needs to be reverse complemented | |
5040 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2); | |
5041 if ($contains_deletion_2){ | |
5042 $genomic_seq_for_MD_tag_2 = reverse_complement($genomic_seq_for_MD_tag_2); | |
5043 } | |
5044 } | |
5045 | |
5046 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only) | |
5047 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){ | |
5048 ### [Index 2, sequence originated from the complementary to (converted) top strand] | |
5049 $counting{GA_CT_CT_count}++; | |
5050 $alignment_read_1 = '-'; | |
5051 $alignment_read_2 = '+'; | |
5052 $read_conversion_info_1 = 'GA'; | |
5053 $read_conversion_info_2 = 'CT'; | |
5054 $genome_conversion = 'CT'; | |
5055 | |
5056 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented | |
5057 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); | |
5058 if ($contains_deletion_1){ | |
5059 $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); | |
5060 } | |
5061 } | |
5062 | |
5063 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only) | |
5064 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){ | |
5065 ### [Index 3, sequence originated from the (converted) reverse strand] | |
5066 $counting{CT_GA_GA_count}++; | |
5067 $alignment_read_1 = '-'; | |
5068 $alignment_read_2 = '+'; | |
5069 $read_conversion_info_1 = 'CT'; | |
5070 $read_conversion_info_2 = 'GA'; | |
5071 $genome_conversion = 'GA'; | |
5072 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented | |
5073 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1); | |
5074 if ($contains_deletion_1){ | |
5075 $genomic_seq_for_MD_tag_1 = reverse_complement($genomic_seq_for_MD_tag_1); | |
5076 } | |
5077 } | |
5078 else{ | |
5079 die "Too many bowtie result filehandles\n"; | |
5080 } | |
5081 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, | |
5082 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions | |
5083 | |
5084 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1; | |
5085 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2; | |
5086 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; | |
5087 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1; | |
5088 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2; | |
5089 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1; | |
5090 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2; | |
5091 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_1} = $genomic_seq_for_MD_tag_1; | |
5092 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag_2} = $genomic_seq_for_MD_tag_2; | |
5093 | |
5094 ## the end position of a read is stored in $pos | |
5095 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1; | |
5096 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2; | |
5097 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1; | |
5098 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2; | |
5099 } | |
5100 | |
5101 ########################################## | |
5102 ### PRINT SINGLE END RESULTS: Bowtie 1 ### | |
5103 ########################################## | |
5104 | |
sub print_bisulfite_mapping_result_single_end{
    # Writes one uniquely mapped single-end read (Bowtie 1 mode) together with its
    # methylation call, either as a tab-separated 'vanilla' line on OUT or via
    # single_end_SAM_output().
    #
    #   $identifier              - read ID, used as key into $methylation_call_params
    #   $sequence                - read sequence
    #   $methylation_call_params - hashref with the alignment details stored per read ID
    #   $quality_value           - FastQ quality string of the read
    my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

    ### the FastQ quality is always written out in Sanger encoding (Phred 33 scale)
    $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                   : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                   :            $quality_value;

    ### Bowtie 1 reports the index and not the bp position, hence the start position is shifted by +1.
    ### This modifies the entry in the caller's hash in place.
    $methylation_call_params->{$identifier}->{position} += 1;
    my $alignment = $methylation_call_params->{$identifier};

    ### writing every uniquely mapped read and its methylation call to the output file
    if ($vanilla){
        my @columns = (
            $identifier,
            @{$alignment}{qw(alignment_strand chromosome position end_position)},
            $sequence,
            @{$alignment}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
            $quality_value,
        );
        print OUT join("\t",@columns)."\n";
    }
    else{ # SAM output, default since Bismark v1.0.0
        single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value); # at the end of the script
    }
}
5128 | |
5129 ########################################## | |
5130 ### PRINT SINGLE END RESULTS: Bowtie 2 ### | |
5131 ########################################## | |
5132 | |
sub print_bisulfite_mapping_result_single_end_bowtie2{
    # Hands one mapped single-end read (Bowtie 2 mode) and its methylation call to
    # single_end_SAM_output(); unmapped and ambiguous reads were already printed elsewhere.
    #
    #   $identifier              - read ID, used as key into $methylation_call_params
    #   $sequence                - read sequence
    #   $methylation_call_params - hashref with the alignment details stored per read ID
    #   $quality_value           - FastQ quality string of the read
    my ($identifier,$sequence,$methylation_call_params,$quality_value) = @_;

    ### the FastQ quality is always written out in Sanger encoding (Phred 33 scale)
    my $sanger_quality = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                       : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                       :            $quality_value;

    ### Bowtie 2 mode only produces SAM output
    single_end_SAM_output($identifier,$sequence,$methylation_call_params,$sanger_quality); # at the end of the script
}
5147 | |
5148 ########################################## | |
5149 ### PRINT PAIRED END RESULTS: Bowtie 1 ### | |
5150 ########################################## | |
5151 | |
sub print_bisulfite_mapping_results_paired_ends{
    # Writes one aligned read pair (Bowtie 1 mode) together with both methylation
    # calls, either as a tab-separated 'vanilla' line on OUT or via paired_end_SAM_output().
    #
    #   $identifier                        - read pair ID, used as key into $methylation_call_params
    #   $sequence_1, $sequence_2           - sequences of read 1 and read 2
    #   $methylation_call_params           - hashref with the alignment details stored per read ID
    #   $quality_value_1, $quality_value_2 - FastQ quality strings of read 1 and read 2
    my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

    ### the FastQ qualities are always written out in Sanger encoding (Phred 33 scale)
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                   : $solexa  ? \&convert_solexa_quals_to_phred33
                   :            undef;
    if ($to_phred33){
        $quality_value_1 = $to_phred33->($quality_value_1);
        $quality_value_2 = $to_phred33->($quality_value_2);
    }

    ### Bowtie 1 reports the index and not the bp position, hence the start position is shifted by +1
    ### (the end position is already 1-based). This modifies the entry in the caller's hash in place.
    $methylation_call_params->{$identifier}->{start_seq_1} += 1;
    my $alignment = $methylation_call_params->{$identifier};

    ### writing every single aligned read and its methylation call to the output file
    if ($vanilla){
        my @columns = (
            $identifier,
            @{$alignment}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
            $sequence_1,
            @{$alignment}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
            $sequence_2,
            @{$alignment}{qw(unmodified_genomic_sequence_2 methylation_call_2 read_conversion_1 genome_conversion)},
            $quality_value_1,
            $quality_value_2,
        );
        print OUT join("\t",@columns)."\n";
    }
    else{ # SAM output, default since Bismark v1.0.0
        paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
    }
}
5178 | |
5179 ########################################## | |
5180 ### PRINT PAIRED END RESULTS: Bowtie 2 ### | |
5181 ########################################## | |
5182 | |
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
    # Hands one aligned read pair (Bowtie 2 mode) and its methylation calls to
    # paired_end_SAM_output(); unmapped and ambiguous reads were already printed elsewhere.
    #
    #   $identifier                        - read pair ID, used as key into $methylation_call_params
    #   $sequence_1, $sequence_2           - sequences of read 1 and read 2
    #   $methylation_call_params           - hashref with the alignment details stored per read ID
    #   $quality_value_1, $quality_value_2 - FastQ quality strings of read 1 and read 2
    my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2) = @_;

    ### the FastQ qualities are always written out in Sanger encoding (Phred 33 scale)
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33
                   : $solexa  ? \&convert_solexa_quals_to_phred33
                   :            undef;
    if ($to_phred33){
        $quality_value_1 = $to_phred33->($quality_value_1);
        $quality_value_2 = $to_phred33->($quality_value_2);
    }

    ### Bowtie 2 mode only produces SAM output
    paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2); # at the end of the script
}
5200 | |
5201 | |
sub convert_phred64_quals_to_phred33{
    # Re-encodes a complete Phred+64 quality string as Phred+33 (Sanger) and
    # returns the new string. Each character is converted independently:
    # quality character -> Phred score -> Phred+33 character.
    my ($phred64_quality) = @_;

    return join ("", map {
        my $phred_score = convert_phred64_quality_string_into_phred_score ($_);
        convert_phred_score_into_phred33_quality_string ($phred_score);
    } split (//,$phred64_quality));
}
5217 | |
sub convert_solexa_quals_to_phred33{
    # Re-encodes a complete Solexa (pre-1.3) quality string as Phred+33 (Sanger) and
    # returns the new string. Each character is converted independently:
    # quality character -> Phred score -> Phred+33 character.
    my ($solexa_quality) = @_;

    return join ("", map {
        my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score ($_);
        convert_phred_score_into_phred33_quality_string ($phred_score);
    } split (//,$solexa_quality));
}
5233 | |
sub convert_phred_score_into_phred33_quality_string{
    # Returns the single character that encodes the given numeric Phred score
    # with an ASCII offset of 33.
    my ($phred_score) = @_;
    return chr($phred_score + 33);
}
5239 | |
sub convert_phred64_quality_string_into_phred_score{
    # Returns the numeric score of a single quality character that was encoded
    # with an ASCII offset of 64.
    my ($quality_char) = @_;
    return ord($quality_char) - 64;
}
5245 | |
sub convert_solexa_pre1_3_quality_string_into_phred_score{
    # Returns an approximate Phred score for a single Solexa (pre-1.3) quality character.
    ### A fixed offset of 59 is used on purpose: all Phred Scores between 10 and 40 look exactly
    ### the same, and there is only a minute difference for values between 0 and 10
    my ($quality_char) = @_;
    return ord($quality_char) - 59;
}
5252 | |
5253 | |
sub extract_corresponding_genomic_sequence_single_end {
    # Looks up the genomic sequence underneath a single-end alignment and records, in the
    # read's entry of $methylation_call_params, which strand it aligned to and which
    # conversions (read / genome) apply. Also bumps the matching strand counter in %counting.
    #
    #   $sequence_identifier     - read ID, used as key into $methylation_call_params
    #   $methylation_call_params - hashref; the entry for the read must provide
    #                              index, chromosome, position and bowtie_sequence
    #
    # Keys written to the read's entry: alignment_strand, read_conversion, genome_conversion,
    # unmodified_genomic_sequence (empty string if the window would run over a chromosome
    # edge) and end_position.
    # Dies if the (PBAT-adjusted) index is not 0, 1, 2 or 3.
    my ($sequence_identifier,$methylation_call_params) = @_;

    ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands.
    ### In PBAT mode indexes 0 and 1 are simply not run, so the reported index is shifted by 2
    my $strand_index = $methylation_call_params->{$sequence_identifier}->{index} + ($pbat ? 2 : 0);
    my $alignment    = $methylation_call_params->{$sequence_identifier};

    ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
    ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
    my ($alignment_strand,$read_conversion_info,$genome_conversion);
    my $extra_bases_upstream; # true: the 2 extra bp are taken in front of the alignment start, false: behind its end
    my $needs_revcomp;        # true: the extracted forward strand sequence has to be reverse complemented

    if ($strand_index == 0){
        ### CT converted read vs. CT converted genome
        ### [Index 0, sequence originated from (converted) forward strand]
        $counting{CT_CT_count}++;
        ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','CT','CT');
        ($extra_bases_upstream,$needs_revcomp) = (0,0);
    }
    elsif ($strand_index == 1){
        ### CT converted read vs. GA converted genome
        ### [Index 1, sequence originated from (converted) reverse strand]
        $counting{CT_GA_count}++;
        ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','CT','GA');
        ($extra_bases_upstream,$needs_revcomp) = (1,1);
    }
    elsif ($strand_index == 2){
        ### GA converted read vs. CT converted genome
        ### [Index 2, sequence originated from complementary to (converted) forward strand]
        $counting{GA_CT_count}++;
        ($alignment_strand,$read_conversion_info,$genome_conversion) = ('-','GA','CT');
        ($extra_bases_upstream,$needs_revcomp) = (0,1);
    }
    elsif ($strand_index == 3){
        ### GA converted read vs. GA converted genome
        ### [Index 3, sequence originated from complementary to (converted) reverse strand]
        $counting{GA_GA_count}++;
        ($alignment_strand,$read_conversion_info,$genome_conversion) = ('+','GA','GA');
        ($extra_bases_upstream,$needs_revcomp) = (1,0);
    }
    else{
        die "Too many bowtie result filehandles\n";
    }

    ### Extracting the genomic sequence plus 2 extra bases, so that a C at the last position of the
    ### observed sequence can still be assigned to CpG, CHG or CHH context.
    my $chromosome  = $alignment->{chromosome};
    my $start       = $alignment->{position};
    my $read_length = length($alignment->{bowtie_sequence});

    my $window_start = $extra_bases_upstream ? $start-2 : $start;

    ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
    my $window_fits = $extra_bases_upstream
        ? ($start-2 >= 0)
        : (length($chromosomes{$chromosome}) > $start+$read_length+1);

    my $non_bisulfite_sequence = '';
    if ($window_fits){
        $non_bisulfite_sequence = substr ($chromosomes{$chromosome},$window_start,$read_length+2);
        ## extra bases taken on the forward strand end up on the opposite side after reverse complementation
        $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if $needs_revcomp;
    }

    @{$alignment}{qw(alignment_strand read_conversion genome_conversion unmodified_genomic_sequence)}
        = ($alignment_strand,$read_conversion_info,$genome_conversion,$non_bisulfite_sequence);

    ### at this point we can also determine the end position of a read
    $alignment->{end_position} = $start+$read_length;
}
5363 | |
5364 | |
5365 sub extract_corresponding_genomic_sequence_single_end_bowtie2{ | |
5366 my ($sequence_identifier,$methylation_call_params) = @_; | |
5367 | |
5368 my $MD_tag = $methylation_call_params->{$sequence_identifier}->{MD_tag}; | |
5369 my $cigar = $methylation_call_params->{$sequence_identifier}->{CIGAR}; | |
5370 | |
5371 my $contains_deletion = 0; | |
5372 if ($cigar =~ /D/){ | |
5373 $contains_deletion = 1; | |
5374 # warn "$cigar\n$MD_tag\n"; | |
5375 } | |
5376 ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands. We are also giving the | |
5377 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call | |
5378 | |
5379 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against, | |
5380 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions | |
5381 my $alignment_strand; | |
5382 my $read_conversion_info; | |
5383 my $genome_conversion; | |
5384 | |
5385 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and | |
5386 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence | |
5387 my $non_bisulfite_sequence = ''; | |
5388 my $genomic_seq_for_MD_tag = ''; # this sequence contains potential deletions in the genome as well so that we can generate a proper MD tag for the SAM output | |
5389 | |
5390 ### Positions in SAM format are 1 based, so we need to subract 1 when getting substrings | |
5391 my $pos = $methylation_call_params->{$sequence_identifier}->{position}-1; | |
5392 | |
5393 # parsing CIGAR string | |
5394 my @len = split (/\D+/,$cigar); # storing the length per operation | |
5395 my @ops = split (/\d+/,$cigar); # storing the operation | |
5396 shift @ops; # remove the empty first element | |
5397 die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
5398 | |
5399 my $pbat_index_modifier = 0; | |
5400 | |
5401 if ($pbat){ | |
5402 $pbat_index_modifier += 2; # (we are simply not running indexes 0 or 1! | |
5403 } | |
5404 | |
5405 ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted reads vs. GA converted genome (CTOB, index 3) | |
5406 if ( (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1) or (($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3) ){ | |
5407 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
5408 unless ( ($pos-2) >= 0){ # exiting with en empty genomic sequence otherwise | |
5409 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; | |
5410 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; | |
5411 return; | |
5412 } | |
5413 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos-2,2); | |
5414 } | |
5415 | |
5416 my $indels = 0; | |
5417 | |
5418 foreach (0..$#len){ | |
5419 if ($ops[$_] eq 'M'){ | |
5420 #extracting genomic sequence | |
5421 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); | |
5422 if ($contains_deletion){ | |
5423 $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); | |
5424 } | |
5425 # adjusting position | |
5426 $pos += $len[$_]; | |
5427 } | |
5428 elsif ($ops[$_] eq 'I'){ # insertion in the read sequence | |
5429 # we simply add padding Xs instead of finding genomic sequence. This will not be used to infer methylation calls and we can later ignore it better during the generation of the MD:Z-tag | |
5430 $non_bisulfite_sequence .= 'X' x $len[$_]; | |
5431 if ($contains_deletion){ | |
5432 $genomic_seq_for_MD_tag .= 'X' x $len[$_]; | |
5433 } | |
5434 # warn "$non_bisulfite_sequence\n"; | |
5435 # position doesn't need to be adjusting | |
5436 | |
5437 ### 03 06 2014: In fact we don't need to add anything to the hemming distance for insertions since we use padding Xs which will fail the base by base comparison in hemming_dist() | |
5438 # $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) | |
5439 } | |
5440 elsif ($ops[$_] eq 'D'){ # deletion in the read sequence | |
5441 # we do not add any genomic sequence but only adjust the position | |
5442 | |
5443 # we do however add the genomic sequence to the $genomic_sequence for MD-tag determination if the CIGAR string contained a deletion | |
5444 if ($contains_deletion){ | |
5445 $genomic_seq_for_MD_tag .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,$len[$_]); | |
5446 } | |
5447 $pos += $len[$_]; | |
5448 $indels += $len[$_]; # adding this to $indels so we can determine the hemming distance for the SAM output (= single-base substitutions (mismatches, insertions, deletions) | |
5449 } | |
5450 elsif($cigar =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die | |
5451 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; | |
5452 } | |
5453 else{ | |
5454 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n"; | |
5455 } | |
5456 } | |
5457 | |
5458 ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted reads vs. CT converted genome (CTOT, index 2) | |
5459 if ( ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0) or ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2) ){ | |
5460 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome | |
5461 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos+2){ # exiting with en empty genomic sequence otherwise | |
5462 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; | |
5463 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; | |
5464 return; | |
5465 } | |
5466 $non_bisulfite_sequence .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos,2); | |
5467 # print "$methylation_call_params->{$sequence_identifier}->{bowtie_sequence}\n$non_bisulfite_sequence\n"; | |
5468 } | |
5469 | |
5470 | |
5471 ### results from CT converted read vs. CT converted genome (+ orientation alignments are reported only) | |
5472 if ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 0){ | |
5473 ### [Index 0, sequence originated from (converted) forward strand] | |
5474 $counting{CT_CT_count}++; | |
5475 $alignment_strand = '+'; | |
5476 $read_conversion_info = 'CT'; | |
5477 $genome_conversion = 'CT'; | |
5478 } | |
5479 | |
5480 ### results from CT converted reads vs. GA converted genome (- orientation alignments are reported only) | |
5481 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 1){ | |
5482 ### [Index 1, sequence originated from (converted) reverse strand] | |
5483 $counting{CT_GA_count}++; | |
5484 $alignment_strand = '-'; | |
5485 $read_conversion_info = 'CT'; | |
5486 $genome_conversion = 'GA'; | |
5487 | |
5488 ### reverse complement! | |
5489 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); | |
5490 if ($contains_deletion){ | |
5491 $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); | |
5492 } | |
5493 } | |
5494 | |
5495 ### results from GA converted reads vs. CT converted genome (- orientation alignments are reported only) | |
5496 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 2){ | |
5497 ### [Index 2, sequence originated from complementary to (converted) forward strand] | |
5498 $counting{GA_CT_count}++; | |
5499 $alignment_strand = '-'; | |
5500 $read_conversion_info = 'GA'; | |
5501 $genome_conversion = 'CT'; | |
5502 | |
5503 ### reverse complement! | |
5504 $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence); | |
5505 if ($contains_deletion){ | |
5506 $genomic_seq_for_MD_tag = reverse_complement($genomic_seq_for_MD_tag); | |
5507 } | |
5508 } | |
5509 | |
5510 ### results from GA converted reads vs. GA converted genome (+ orientation alignments are reported only) | |
5511 elsif ( ($methylation_call_params->{$sequence_identifier}->{index} + $pbat_index_modifier) == 3){ | |
5512 ### [Index 3, sequence originated from complementary to (converted) reverse strand] | |
5513 $counting{GA_GA_count}++; | |
5514 $alignment_strand = '+'; | |
5515 $read_conversion_info = 'GA'; | |
5516 $genome_conversion = 'GA'; | |
5517 | |
5518 } | |
5519 else{ | |
5520 die "Too many Bowtie 2 result filehandles\n"; | |
5521 } | |
5522 | |
5523 $methylation_call_params->{$sequence_identifier}->{alignment_strand} = $alignment_strand; | |
5524 $methylation_call_params->{$sequence_identifier}->{read_conversion} = $read_conversion_info; | |
5525 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion; | |
5526 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence} = $non_bisulfite_sequence; | |
5527 $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag} = $genomic_seq_for_MD_tag; | |
5528 | |
5529 # if ($contains_deletion){ | |
5530 # warn "non-bis: $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence}\n"; | |
5531 # warn "MD-seq: $methylation_call_params->{$sequence_identifier}->{genomic_seq_for_MD_tag}\n"; | |
5532 # } | |
5533 | |
5534 ### the end position of a read is stored in $pos | |
5535 $methylation_call_params->{$sequence_identifier}->{end_position} = $pos; | |
5536 $methylation_call_params->{$sequence_identifier}->{indels} = $indels; | |
5537 } | |
5538 | |
5539 ### METHYLATION CALL | |
5540 | |
### METHYLATION CALL
###
### Compares a read base by base against the corresponding genomic sequence and returns a
### methylation call string with one character per read base:
###
#################################################################
### . for bases not involving cytosines                       ###
### X for methylated C in CHG context (was protected)         ###
### x for not methylated C in CHG context (was converted)     ###
### H for methylated C in CHH context (was protected)         ###
### h for not methylated C in CHH context (was converted)     ###
### Z for methylated C in CpG context (was protected)         ###
### z for not methylated C in CpG context (was converted)     ###
### U for methylated C in unknown context (was protected)     ###
### u for not methylated C in unknown context (was converted) ###
#################################################################
###
### Arguments:
###   $identifier                 - read ID (not used for the call itself)
###   $sequence_actually_observed - the read sequence
###   $genomic_sequence           - genomic sequence covering the read plus 2 extra bases; for 'CT'
###                                 reads the extra bases follow the read, for 'GA' reads they
###                                 precede it
###   $read_conversion            - 'CT' or 'GA'
###
### Side effect: the per-context totals in the file-level %counting hash are incremented.
### Dies if $read_conversion is neither 'CT' nor 'GA'.
sub methylation_call{
  my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;

  ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
  my @seq     = split(//,$sequence_actually_observed);
  my @genomic = split(//,$genomic_sequence);

  ### the genomic sequence is expected to be exactly 2 bp longer than the read
  warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq == (scalar @genomic - 2));

  ### Strand specific settings:
  ###   $cytosine_base  - genomic base that represents the cytosine ('G' if the C lies on the opposing strand)
  ###   $converted_base - read base seen when that cytosine was bisulfite converted (i.e. unmethylated)
  ###   $partner_base   - neighbouring genomic base that puts the cytosine into CpG (or CHG) context
  ###   offsets         - where (relative to the read index) to find the genomic base itself, and the
  ###                     first and second neighbouring base used to determine the context
  my ($cytosine_base,$converted_base,$partner_base,$genomic_offset,$first_offset,$second_offset);

  if ($read_conversion eq 'CT'){
    ($cytosine_base,$converted_base,$partner_base) = ('C','T','G');
    ($genomic_offset,$first_offset,$second_offset) = (0,1,2);   # context is read from the downstream bases
  }
  elsif ($read_conversion eq 'GA'){
    ($cytosine_base,$converted_base,$partner_base) = ('G','A','C');
    ($genomic_offset,$first_offset,$second_offset) = (2,1,0);   # context is read from the upstream bases
  }
  else{
    die "Strand conversion info is required to perform a methylation call\n";
  }

  ### upper case symbols are used for methylated, lower case symbols for unmethylated cytosines
  my %symbol_for         = (CpG => 'Z', CHG => 'X', CHH => 'H', unknown => 'U');
  my %methylated_count   = (CpG => 0,   CHG => 0,   CHH => 0,   unknown => 0);
  my %unmethylated_count = (CpG => 0,   CHG => 0,   CHH => 0,   unknown => 0);

  my @match = ();

  for my $index (0..$#seq){
    my $read_base    = $seq[$index];
    my $genomic_base = $genomic[$index + $genomic_offset];
    my $call         = '.';   # anything that does not involve a cytosine, including all other mismatches

    if ($genomic_base eq $cytosine_base){
      my $was_protected = ($read_base eq $cytosine_base);    # read still shows the unconverted base
      my $was_converted = ($read_base eq $converted_base);   # read shows the bisulfite converted base

      if ($was_protected or $was_converted){
        my $context = _cytosine_context($genomic[$index + $first_offset],$genomic[$index + $second_offset],$partner_base);

        if ($was_protected){
          ++$methylated_count{$context};
          $call = $symbol_for{$context};
        }
        else{
          ++$unmethylated_count{$context};
          $call = lc $symbol_for{$context};
        }
      }
    }
    push @match,$call;
  }

  my $methylation_call = join ("",@match);

  $counting{total_meCHH_count}                  += $methylated_count{CHH};
  $counting{total_meCHG_count}                  += $methylated_count{CHG};
  $counting{total_meCpG_count}                  += $methylated_count{CpG};
  $counting{total_meC_unknown_count}            += $methylated_count{unknown};
  $counting{total_unmethylated_CHH_count}       += $unmethylated_count{CHH};
  $counting{total_unmethylated_CHG_count}       += $unmethylated_count{CHG};
  $counting{total_unmethylated_CpG_count}       += $unmethylated_count{CpG};
  $counting{total_unmethylated_C_unknown_count} += $unmethylated_count{unknown};

  return $methylation_call;
}

### Determines the sequence context of a cytosine from its two neighbouring genomic bases.
###   $first_neighbour  - genomic base directly next to the cytosine
###   $second_neighbour - genomic base two positions away from the cytosine
###   $partner_base     - base that marks CpG/CHG context ('G' for C->T reads, 'C' for G->A reads)
### Returns 'CpG', 'CHG', 'CHH' or 'unknown'. An N in a deciding position yields 'unknown' since the
### true context cannot be determined.
sub _cytosine_context{
  my ($first_neighbour,$second_neighbour,$partner_base) = @_;

  return 'CpG'     if ($first_neighbour  eq $partner_base);
  return 'unknown' if ($first_neighbour  eq 'N');   # it might have been a CG
  return 'CHG'     if ($second_neighbour eq $partner_base);
  return 'unknown' if ($second_neighbour eq 'N');   # it might have been a CHG or CHH
  return 'CHH';
}
5760 | |
### Reads all FastA files (*.fa, or *.fasta if there are no *.fa files) found in $genome_folder and
### stores their sequences, upper-cased, in the %chromosomes hash keyed by chromosome name. The
### order in which chromosomes were encountered is recorded in %SQ_order. Multi-FastA files are
### supported. Chromosome names have to be unique across all files, otherwise the sub dies.
###
### Argument: the working directory to change back to once the genome has been read.
###
### If $cram is set and no $cram_ref was specified, a single multi-FastA file with all chromosomes
### is written to $output_dir and $cram_ref is set to its path.
sub read_genome_into_memory{

  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  warn "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  my $SQ_count = 0;

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open with a lexical filehandle so that unusual file names can't be mistaken for an open mode
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    unless (defined $first_line){
      die "The sequence file $chromosome_filename is empty (expected a FastA header on the first line)\n";
    }
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);
    my $sequence = '';   # initialised so that header-only entries are stored with a length of 0 instead of undef

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//; # removing carriage returns if present

      if ($line =~ /^>/){

        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
          $SQ_order{$SQ_count} = $chromosome_name;

          ++$SQ_count;
        }

        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    close $chr_in or warn "Failed to close filehandle for $chromosome_filename: $!\n";

    ### Processing last chromosome of a multi Fasta File or the only entry in case of single entry FastA files

    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }

      ### NOTE(review): here the counter is incremented BEFORE it is used as key, whereas above it is
      ### incremented afterwards. The keys of %SQ_order are therefore increasing but not necessarily
      ### consecutive; this is kept as is on the assumption that consumers only rely on the numerical
      ### order of the keys - confirm before changing.
      ++$SQ_count;

      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
      $SQ_order{$SQ_count} = $chromosome_name;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";

  ### If no single multi-FastA genome file was specified explicitly we will generate one here and write it to the output directory
  if ($cram){
    unless (defined $cram_ref){
      warn "Reconstituting a single multi-FastA genome file as CRAM reference (you may specify such a file using --cram_ref <file> explicitely to prevent this behaviour)\n";

      $cram_ref = "${output_dir}Bismark_genome_CRAM_reference.mfa";
      warn "Writing multi-FastA file to $cram_ref\n";
      open (my $ref_out,'>',$cram_ref) or die "Failed to write to file $cram_ref\n";
      foreach my $chr (keys %chromosomes){
        print {$ref_out} ">$chr\n$chromosomes{$chr}\n";
      }
      warn "Complete\n";
      close $ref_out or warn "Failed to close filehandle REF: $!\n";
    }
  }
}
5866 | |
### Returns the chromosome name of a FastA header line, i.e. everything between the leading '>' and
### the first whitespace (Bowtie appears to name sequences the same way).
### Dies if the line does not start with '>'.
sub extract_chromosome_name {
  my $fasta_header = shift;

  ## the leading '>' is stripped; if there is none this can't be a FastA header
  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ## only the first whitespace-delimited field is used as the name
  my @header_fields = split (/\s+/,$fasta_header);
  return $header_fields[0];
}
5878 | |
### Returns the reverse complement of a DNA sequence. Only the upper case bases A, C, G and T are
### complemented; any other character (e.g. N or the padding X) is kept as it is.
sub reverse_complement{
  my $sequence = shift;
  (my $reverse_complemented = reverse $sequence) =~ tr/CATG/GTAC/;
  return $reverse_complemented;
}
5885 | |
### Writes bisulfite converted versions of a single-end FastA file to $temp_dir: a C -> T converted
### copy and, unless the library is $directional, also a G -> A converted copy. Input files ending
### in .gz are decompressed on the fly; the output is gzip compressed if $gzip is set. $skip and
### $upto restrict the range of reads that get converted and $prefix is prepended to the names of
### the files written.
###
### Argument: path of the FastA file to convert.
### Returns the names (without $temp_dir) of the C -> T and the G -> A converted file; the G -> A
### name is returned for directional libraries as well even though this file is not written then.
sub biTransformFastAFiles {
  my $file = shift;

  ### the temporary files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. gunzip is started without a shell (list form of open) so that
  ### blanks or shell metacharacters in the file name are passed on literally
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','gunzip','-c','--',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $extension     = $gzip ? '.fa.gz' : '.fa';
  my $C_to_T_infile = "${filename}_C_to_T$extension";
  my $G_to_A_infile = "${filename}_G_to_A$extension";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  my ($ctot,$gtoa);

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";

  if ($gzip){
    open ($ctot,'|-',"gzip -c - > ${temp_dir}${C_to_T_infile}") or die "Can't write to file: $!\n";
  }
  else{
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }

  unless ($directional){
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    if ($gzip){
      open ($gtoa,'|-',"gzip -c - > ${temp_dir}${G_to_A_infile}") or die "Can't write to file: $!\n";
    }
    else{
      open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
    }
  }

  my $count = 0;

  ### every FastA entry is expected to consist of exactly 2 lines: the header and the sequence
  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The return value is deliberately not checked here: if reading stopped early (--upto) a
  ### gunzip child is terminated by a broken pipe and close would report a failure
  close $in;

  close $ctot or die "Failed to close filehandle $!\n";

  if ($directional){
    warn "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    close $gtoa or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
5999 | |
### Writes bisulfite converted versions of one file of a paired-end FastA data set to $temp_dir.
### For $directional libraries only a C -> T converted copy is written for read 1 and only a
### G -> A converted copy for read 2; otherwise both converted copies are written. The read number
### is appended to every read ID as /1 or /2 (/1/1 or /2/2 if $bowtie2 is set). The temporary
### files are never gzip compressed, even if $gzip is set.
###
### Arguments:
###   $file        - path of the FastA file to convert (files ending in .gz are decompressed on the fly)
###   $read_number - 1 or 2; the sub dies for any other value
###
### Returns the name(s) (without $temp_dir) of the file(s) written: the C -> T file (directional,
### read 1), the G -> A file (directional, read 2) or both in this order (non-directional).
sub biTransformFastAFiles_paired_end {
  my ($file,$read_number) = @_;

  ### failing early, before any file is touched, if we don't know which read we are dealing with
  unless ($read_number == 1 or $read_number == 2){
    die "Read number needs to be 1 or 2, but was: $read_number\n\n";
  }
  my $mate = ($read_number == 1) ? 1 : 2;

  if ($gzip){
    warn "GZIP compression of temporary files is not supported for paired-end FastA data. Continuing to write uncompressed files\n";
    sleep (2);
  }

  ### the temporary files are named after the input file without any leading directories
  my $filename = $file;
  if ($file =~ /\//){
    ($filename) = $file =~ m/.*\/(.*)$/;
  }

  ### gzipped version of the infile. gunzip is started without a shell (list form of open) so that
  ### blanks or shell metacharacters in the file name are passed on literally
  my $in;
  if ($file =~ /\.gz$/){
    open ($in,'-|','gunzip','-c','--',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = "${filename}_C_to_T.fa";
  my $G_to_A_infile = "${filename}_G_to_A.fa";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  ### directional libraries: C -> T for read 1 and G -> A for read 2 only; otherwise all four strand output
  my $write_C_to_T = (!$directional or $mate == 1);
  my $write_G_to_A = (!$directional or $mate == 2);

  warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n" if ($write_C_to_T);
  warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n" if ($write_G_to_A);

  my ($ctot,$gtoa);
  if ($write_C_to_T){
    open ($ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
  }
  if ($write_G_to_A){
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  ### suffix appended to the read ID (in front of the line ending)
  my $id_suffix = $bowtie2 ? "/$mate/$mate" : "/$mate";

  my $count = 0;

  ### every FastA entry is expected to consist of exactly 2 lines: the header and the sequence
  while (1){
    my $header   = <$in>;
    my $sequence = <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc $sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>/);

    $header =~ s/$/$id_suffix/;

    if ($write_C_to_T){
      (my $sequence_C_to_T = $sequence) =~ tr/C/T/;
      print {$ctot} "$header$sequence_C_to_T";
    }
    if ($write_G_to_A){
      (my $sequence_G_to_A = $sequence) =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### The return value is deliberately not checked here: if reading stopped early (--upto) a
  ### gunzip child is terminated by a broken pipe and close would report a failure
  close $in;

  ### closing the output files explicitly so that they are complete on disk (and write errors are
  ### noticed) before they are used any further
  if ($write_C_to_T){
    close $ctot or die "Failed to close filehandle $!\n";
  }
  if ($write_G_to_A){
    close $gtoa or die "Failed to close filehandle $!\n";
  }

  unless ($directional){
    warn "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }

  if ($mate == 1){
    warn "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
    return ($C_to_T_infile);
  }

  warn "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
  return ($G_to_A_infile);
}
6156 | |
6157 | |
### Writes the bisulfite converted version(s) of a single-end FastQ file to $temp_dir:
###   --pbat           : G -> A converted reads only
###   directional      : C -> T converted reads only
###   non-directional  : C -> T as well as G -> A converted reads
### Input files ending in .gz are decompressed on the fly with gunzip; the output is piped through gzip if
### $gzip is set. The global settings $skip, $upto and $prefix are honoured.
### Returns the name of the G -> A file for --pbat, and the list (C -> T file, G -> A file) otherwise. File names
### are relative to $temp_dir. In directional mode no G -> A file is written, so the second element of the list
### is only a placeholder name.
sub biTransformFastQFiles {
  my $file = shift;

  # name of the input file without any leading directories
  my $filename = $file;
  $filename =~ s/^.*\///;

  # quotes a path so that it can be embedded into a shell command line as one single word
  my $shell_quote = sub {
    my $path = shift;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  # opens the filehandle (passed in as a glob reference) for writing a converted read file to $temp_dir
  my $open_outfile = sub {
    my ($handle,$outfile) = @_;
    if ($gzip){
      open ($handle,'|-','gzip -c - > '.$shell_quote->("$temp_dir$outfile")) or die "Can't write to file: $!\n";
    }
    else{
      open ($handle,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
  };

  ### gzipped version of the infile
  ### (list-form pipe open/3-argument open, so the file name is never interpreted by a shell or by open() itself)
  if ($file =~ /\.gz$/){
    open (IN,'-|','gunzip','-c',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  my $suffix = $gzip ? '.fastq.gz' : '.fastq';

  if ($pbat){ # PBAT-Seq
    $G_to_A_infile .= "_G_to_A$suffix";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $open_outfile->(\*GTOA,$G_to_A_infile);
  }
  else{ # directional or non-directional
    $C_to_T_infile .= "_C_to_T$suffix";
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    $open_outfile->(\*CTOT,$C_to_T_infile);

    unless ($directional){
      $G_to_A_infile .= "_G_to_A$suffix";
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      $open_outfile->(\*GTOA,$G_to_A_infile);
    }
  }

  my $count = 0;
  while (1){
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier2   = <IN>;
    my $quality_score = <IN>;

    # testing for defined-ness rather than truth so that a final line consisting of a single '0' does not end the loop early
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # (fix_IDs above has already replaced tabs in the read ID, so this is not expected to trigger any more)
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    if ($pbat){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
    else{ # directional or non-directional
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

      unless ($directional){
        my $sequence_G_to_A = $sequence;
        $sequence_G_to_A =~ tr/G/A/;
        print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
  }

  # the return value is ignored on purpose: gunzip may have been cut short if we stopped reading early (--upto)
  close IN;

  if ($directional){
    close CTOT or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
  }
  elsif($pbat){
    warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
    close GTOA or die "Failed to close filehandle $!\n";
    return ($G_to_A_infile);
  }
  else{
    close CTOT or die "Failed to close filehandle $!\n";
    close GTOA or die "Failed to close filehandle $!\n";
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
6315 | |
### Writes the bisulfite converted version(s) of one FastQ file of a paired-end experiment to $temp_dir.
###   $file        : FastQ file (files ending in .gz are decompressed on the fly with gunzip)
###   $read_number : 1 or 2, i.e. which mate of the pair the file contains
### Directional mode writes a C -> T converted file for read 1 or a G -> A converted file for read 2 and returns
### its name; otherwise both conversions are written and the list (C -> T file, G -> A file) is returned.
### The read IDs are extended by /1 or /2 (/1/1 or /2/2 if $bowtie2 is set). The global settings $skip, $upto,
### $prefix and $gzip are honoured. File names are relative to $temp_dir.
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  # name of the input file without any leading directories
  my $filename = $file;
  $filename =~ s/^.*\///;

  # quotes a path so that it can be embedded into a shell command line as one single word
  my $shell_quote = sub {
    my $path = shift;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  # opens the filehandle (passed in as a glob reference) for writing a converted read file to $temp_dir
  my $open_outfile = sub {
    my ($handle,$outfile) = @_;
    if ($gzip){
      open ($handle,'|-','gzip -c - > '.$shell_quote->("$temp_dir$outfile")) or die "Can't write to file: $!\n";
    }
    else{
      open ($handle,'>',"$temp_dir$outfile") or die "Couldn't write to file $!\n";
    }
  };

  ### gzipped version of the infile
  ### (list-form pipe open/3-argument open, so the file name is never interpreted by a shell or by open() itself)
  if ($file =~ /\.gz$/){
    open (IN,'-|','gunzip','-c',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $suffix = $gzip ? '.fastq.gz' : '.fastq';
  my $C_to_T_infile = "${filename}_C_to_T$suffix";
  my $G_to_A_infile = "${filename}_G_to_A$suffix";

  if ($prefix){
    $C_to_T_infile = "$prefix.$C_to_T_infile";
    $G_to_A_infile = "$prefix.$G_to_A_infile";
  }

  if ($directional){
    if ($read_number == 1){
      warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
      $open_outfile->(\*CTOT,$C_to_T_infile);
    }
    elsif ($read_number == 2){
      warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
      $open_outfile->(\*GTOA,$G_to_A_infile);
    }
    else{
      die "Read number needs to be 1 or 2, but was $read_number!\n\n";
    }
  }
  else{
    warn "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
    warn "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    $open_outfile->(\*CTOT,$C_to_T_infile);
    $open_outfile->(\*GTOA,$G_to_A_infile);
  }

  # tag that gets appended to every read ID; stays undefined for an invalid read number, which is only
  # reported once the first read is about to be written (see below)
  my $read_tag;
  if ($read_number == 1){
    $read_tag = $bowtie2 ? '/1/1' : '/1';
  }
  elsif ($read_number == 2){
    $read_tag = $bowtie2 ? '/2/2' : '/2';
  }

  my $count = 0;
  while (1){
    my $identifier    = <IN>;
    my $sequence      = <IN>;
    my $identifier2   = <IN>;
    my $quality_score = <IN>;

    # testing for defined-ness rather than truth so that a final line consisting of a single '0' does not end the loop early
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
        die "Input file doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    die "Read number needs to be 1 or 2\n" unless (defined $read_tag);
    $identifier =~ s/$/$read_tag/; # the tag is inserted in front of the trailing newline

    my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    $sequence_G_to_A =~ tr/G/A/;

    if ($directional){
      if ($read_number == 1){
        print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      }
      else{
        print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
      }
    }
    else{
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  # the return value is ignored on purpose: gunzip may have been cut short if we stopped reading early (--upto)
  close IN;

  if ($directional){
    if ($read_number == 1){
      warn "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close CTOT or die "Failed to close filehandle $!\n";
      return ($C_to_T_infile);
    }
    else{
      warn "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
      close GTOA or die "Failed to close filehandle $!\n";
      return ($G_to_A_infile);
    }
  }
  else{
    warn "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
    close CTOT or die "Failed to close filehandle $!\n";
    close GTOA or die "Failed to close filehandle $!\n";
    return ($C_to_T_infile,$G_to_A_infile);
  }
}
6490 | |
6491 | |
6492 ### SPECIAL BOWTIE 1 PAIRED-END FORMAT FOR GZIPPED OUTPUT FILES | |
6493 | |
### Writes the bisulfite converted reads of a paired-end FastQ experiment to gzipped files in $temp_dir, using
### one line per read pair:
### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>   (tab separated)
###   $file_1, $file_2 : FastQ files of read 1 and read 2 (files ending in .gz are decompressed with gunzip)
### A file with C -> T converted read 1 and G -> A converted read 2 is always written; unless $directional is
### set, a second file with G -> A converted read 1 and C -> T converted read 2 is written as well.
### Returns the name(s) of the written file(s), relative to $temp_dir. The global settings $skip, $upto and
### $prefix are honoured. Reading stops as soon as either input file runs out of complete records.
sub biTransformFastQFiles_paired_end_bowtie1_gzip {
  my ($file_1,$file_2) = @_;

  # name of the first input file without any leading directories; the output files are named after it
  my $filename = $file_1;
  $filename =~ s/^.*\///;

  # quotes a path so that it can be embedded into a shell command line as one single word
  my $shell_quote = sub {
    my $path = shift;
    $path =~ s/'/'\\''/g;
    return "'$path'";
  };

  # opens the filehandle (passed in as a glob reference) for reading a plain or gzipped FastQ file
  # (list-form pipe open/3-argument open, so the file name is never interpreted by a shell or by open() itself)
  my $open_infile = sub {
    my ($handle,$infile) = @_;
    if ($infile =~ /\.gz$/){
      open ($handle,'-|','gunzip','-c',$infile) or die "Couldn't read from file $infile: $!\n";
    }
    else{
      open ($handle,'<',$infile) or die "Couldn't read from file $infile: $!\n";
    }
  };

  $open_infile->(\*IN_1,$file_1);
  $open_infile->(\*IN_2,$file_2);

  if ($skip){
    warn "Skipping the first $skip reads from $file_1 and $file_2\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file_1 and $file_2\n";
    sleep (1);
  }

  my $CT_plus_GA_infile = my $GA_plus_CT_infile = $filename;

  if ($prefix){
    $CT_plus_GA_infile = "$prefix.$CT_plus_GA_infile";
    $GA_plus_CT_infile = "$prefix.$GA_plus_CT_infile";
  }

  $CT_plus_GA_infile .= '.CT_plus_GA.fastq.gz';
  $GA_plus_CT_infile .= '.GA_plus_CT.fastq.gz';

  warn "Writing a C -> T converted version of $file_1 and a G -> A converted version of $file_2 to $temp_dir$CT_plus_GA_infile\n";
  open (CTPLUSGA,'|-','gzip -c - > '.$shell_quote->("$temp_dir$CT_plus_GA_infile")) or die "Can't write to file: $!\n";

  unless ($directional){
    # reported on STDERR like all other status messages
    warn "Writing a G -> A converted version of $file_1 and a C -> T converted version of $file_2 to $temp_dir$GA_plus_CT_infile\n";
    open (GAPLUSCT,'|-','gzip -c - > '.$shell_quote->("$temp_dir$GA_plus_CT_infile")) or die "Can't write to file: $!\n";
  }

  ### for Bowtie 1 we need to write a single gzipped file with 1 line per pair of sequences in the following format:
  ### <seq-ID> <sequence #1 mate> <quality #1 mate> <sequence #2 mate> <quality #2 mate>

  my $count = 0;
  while (1){
    my $identifier_1    = <IN_1>;
    my $sequence_1      = <IN_1>;
    my $identifier2_1   = <IN_1>;
    my $quality_score_1 = <IN_1>;

    my $identifier_2    = <IN_2>;
    my $sequence_2      = <IN_2>;
    my $identifier2_2   = <IN_2>;
    my $quality_score_2 = <IN_2>;

    # testing for defined-ness rather than truth so that a final line consisting of a single '0' does not end the loop early
    last unless (defined $identifier_1 and defined $sequence_1 and defined $identifier2_1 and defined $quality_score_1
                 and defined $identifier_2 and defined $sequence_2 and defined $identifier2_2 and defined $quality_score_2);

    ++$count;

    ## small check if the sequence file appears to be a FastQ file
    if ($count == 1){
      if ($identifier_1 !~ /^\@/ or $identifier2_1 !~ /^\+/){
        die "Input file 1 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
      if ($identifier_2 !~ /^\@/ or $identifier2_2 !~ /^\+/){
        die "Input file 2 doesn't seem to be in FastQ format at sequence $count: $!\n";
      }
    }

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    chomp $identifier_1;
    chomp $sequence_1;
    chomp $sequence_2;
    chomp $quality_score_1;
    chomp $quality_score_2;

    $identifier_1 =~ s/^\@//;
    $identifier_1 =~ s/$/\/1/; #adding an extra /1 to the end which is being removed by Bowtie otherwise (which leads to no sequences alignments whatsoever)

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence_1 = uc$sequence_1; # make input file 1 case insensitive
    $sequence_2 = uc$sequence_2; # make input file 2 case insensitive

    my $sequence_1_C_to_T = $sequence_1;
    my $sequence_2_G_to_A = $sequence_2;
    $sequence_1_C_to_T =~ tr/C/T/;
    $sequence_2_G_to_A =~ tr/G/A/;

    print CTPLUSGA "$identifier_1\t$sequence_1_C_to_T\t$quality_score_1\t$sequence_2_G_to_A\t$quality_score_2\n";

    unless ($directional){
      my $sequence_1_G_to_A = $sequence_1;
      my $sequence_2_C_to_T = $sequence_2;
      $sequence_1_G_to_A =~ tr/G/A/;
      $sequence_2_C_to_T =~ tr/C/T/;
      print GAPLUSCT "$identifier_1\t$sequence_1_G_to_A\t$quality_score_1\t$sequence_2_C_to_T\t$quality_score_2\n";
    }
  }

  # the return values are ignored on purpose: gunzip may have been cut short if we stopped reading early (--upto)
  close IN_1;
  close IN_2;

  close CTPLUSGA or die "Couldn't close filehandle\n";
  warn "\nCreated C -> T converted version of FastQ file '$file_1' and G -> A converted version of FastQ file '$file_2' ($count sequences in total)\n";

  if ($directional){
    warn "\n";
    return ($CT_plus_GA_infile);
  }
  else{
    close GAPLUSCT or die "Couldn't close filehandle\n";
    warn "Created G -> A converted version of FastQ file '$file_1' and C -> T converted version of FastQ file '$file_2' ($count sequences in total)\n\n";
    return ($CT_plus_GA_infile,$GA_plus_CT_infile);
  }
}
6631 | |
6632 | |
### Returns a copy of the given read ID in which every run of spaces and/or tabs has been collapsed
### into one single underscore. All other characters (including a trailing newline) are left untouched.
sub fix_IDs{
  my ($read_id) = @_;
  # both blanks and tabs are mapped to '_'; /s squeezes each run of mapped characters into a single '_'
  $read_id =~ tr/ \t/_/s;
  return $read_id;
}
6638 | |
### Checks whether a single-end alignment was reported on the strand that makes sense for the alignment process
### it came from, and keeps count of the outcome in @fhs.
###   $index  : index of the alignment process in @fhs
###   $strand : strand of the alignment, '+' or '-'
### Returns 1 and increments {seen} if the orientation is sensical. Returns 0 otherwise; {wrong_strand} is
### incremented if the alignment was reported on the opposite strand. Any other strand value is not counted.
### Dies if the name of the alignment process is not one of the four known ones.
sub ensure_sensical_alignment_orientation_single_end{
  my $index  = shift; # index number if the sequence produced an alignment
  my $strand = shift;

  ### the only strand that makes sense for each combination of converted read and converted genome
  my %sensical_strand_for = (
    CTreadCTgenome => '+', # FORWARD converted read against FORWARD converted genome
    CTreadGAgenome => '-', # FORWARD converted read against reverse converted genome
    GAreadCTgenome => '-', # Reverse converted read against FORWARD converted genome
    GAreadGAgenome => '+', # Reverse converted read against reverse converted genome
  );

  my $name = $fhs[$index]->{name};
  unless (defined $name and exists $sensical_strand_for{$name}){
    die "One of the above conditions must be true\n";
  }

  ### the alignment is in the correct orientation: we count it and return 1
  if ($strand eq $sensical_strand_for{$name}){
    $fhs[$index]->{seen}++;
    return 1;
  }

  ### an alignment on the opposite strand is nonsensical
  if ($strand eq '+' or $strand eq '-'){
    $fhs[$index]->{wrong_strand}++;
  }

  ### always returning an explicit 0, also for strand values other than '+' or '-'
  return 0;
}
6711 | |
### Checks whether a paired-end alignment (2 lines of aligner output) is oriented in the way that makes sense for
### the alignment process it came from, and keeps count of the outcome in @fhs.
###   $index              : index of the alignment process in @fhs
###   $id_1, $strand_1    : read ID and strand of the first reported alignment
###   $id_2, $strand_2    : read ID and strand of the second reported alignment
### Returns 1 and increments {seen} for a sensical orientation, returns 0 and increments {wrong_strand} for the
### nonsensical one. Dies for any other combination of read IDs and strands, and for unknown process names.
sub ensure_sensical_alignment_orientation_paired_ends{
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_; # index number if the sequence produced an alignment

  ### 1: the sensical alignment lists read 1 first, on the (+) strand, followed by read 2 on the (-) strand
  ### 0: the sensical alignment lists read 2 first, on the (+) strand, followed by read 1 on the (-) strand
  my %read_1_comes_first = (
    CTread1GAread2CTgenome => 1, # [Index 0] CT converted read 1, GA converted read 2, CT converted genome
    GAread1CTread2GAgenome => 1, # [Index 1] GA converted read 1, CT converted read 2, GA converted genome
    GAread1CTread2CTgenome => 0, # [Index 2] GA converted read 1, CT converted read 2, CT converted genome
    CTread1GAread2GAgenome => 0, # [Index 3] CT converted read 1, GA converted read 2, GA converted genome
  );

  my $name = $fhs[$index]->{name};
  unless (defined $name and exists $read_1_comes_first{$name}){
    die "One of the above conditions must be true\n";
  }

  ### the only accepted layouts have the first alignment on the (+) and the second one on the (-) strand;
  ### the read IDs then tell us which of the two reads came first
  my $plus_then_minus = ($strand_1 eq '+' and $strand_2 eq '-');
  my $read_1_first    = ($plus_then_minus and $id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $read_2_first    = ($plus_then_minus and $id_1 =~ /2$/ and $id_2 =~ /1$/);

  unless ($read_1_first or $read_2_first){
    die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
  }

  ### exactly one of the two layouts applies here; we count it as seen if it is the sensical one
  my $sensical = $read_1_comes_first{$name} ? $read_1_first : $read_2_first;
  if ($sensical){
    $fhs[$index]->{seen}++;
    return 1;
  }

  $fhs[$index]->{wrong_strand}++;
  return 0;
}
6808 | |
6809 ##################################################################################################################################################### | |
6810 | |
6811 ### Bowtie 1 (default) | PAIRED-END | FASTA | |
6812 | |
### Starts one Bowtie paired-end alignment per entry in @fhs (FastA input) and reads the first pair of output
### lines from each of them. For every entry this sets:
###   {fh}                        : the pipe the Bowtie output is read from
###   {last_seq_id}               : ID of the first aligned read pair, with the /1 read tag removed
###   {last_line_1}/{last_line_2} : the two output lines of that pair
### The three last_* values are set to undef if Bowtie did not report an alignment. In directional mode entries
### without {inputfile_1} are not run at all and get undef values as well.
### The file names passed in are only used for the status message; the files that are actually aligned are taken
### from {inputfile_1} and {inputfile_2} of each entry.
sub paired_end_align_fragments_to_bisulfite_genome_fastA {

  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
  ## data structure above
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # alignment processes without a first input file are not run in directional mode
    if ($directional and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $forward_only ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # without a complete first pair of output lines we just initialise last_seq_id and last_lines as undefined
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef) x 3;
      next;
    }

    # Bowtie produced an alignment, so we store the first pair of output lines
    chomp $line_1;
    chomp $line_2;
    $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2

    my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
    my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
    if ($id_1 =~ s/\/1$//){ # removing the read 1 tag if present
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
6891 | |
6892 ### Bowtie 2 | PAIRED-END | FASTA | |
6893 | |
# Starts one Bowtie 2 paired-end alignment (FastA input) for every entry of
# @fhs and primes each entry with the first read pair of the aligner output.
#
# Arguments: the names of the C->T and G->A converted files of read 1 and
#            read 2. They are used for the progress messages only; the files
#            that get aligned are $fh->{inputfile_1} and $fh->{inputfile_2}.
#
# For every processed entry this sets
#   $fh->{fh}           read pipe from the Bowtie 2 process
#   $fh->{last_seq_id}  ID of the first read pair with the read 1 tag removed
#   $fh->{last_line_1}  first output line of the pair
#   $fh->{last_line_2}  second output line of the pair
# (the last three are undef if there was no output, or if the entry is skipped).
#
# Dies if the pipe cannot be opened, or if neither line of the first pair
# carries the read 1 tag.
sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode entries without an input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $bt2_options = $bowtie_options;
    $bt2_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip the header lines (they start with @).
    ### A lexical variable is used here so that the caller's $_ is left untouched
    my $line_1;
    while ($line_1 = $fh->{fh}->getline()){
      last unless ($line_1 =~ /^\@/);
    }
    my $line_2 = $fh->{fh}->getline();

    # no (complete) pair in the output: initialise everything as undefined
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    chomp $line_1;
    chomp $line_2;
    my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
    my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We thus identify which line belongs to read 1 and store its ID (without the /1 tag) as last_seq_id.
    ### (Bowtie 2 clips off /1 or /2 endings itself, which is why /1/1 or /2/2 was added to start with)
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      # fatal, like in the other paired-end aligner subroutines: carrying on would
      # store the new lines together with a last_seq_id that does not belong to them
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
6981 | |
6982 ### Bowtie 1 (default) | PAIRED-END | FASTQ | |
6983 | |
# Launches the Bowtie paired-end alignments (FastQ input) for the entries of
# @fhs and stores the first read pair of each aligner output in the entry.
#
# Arguments: names of the C->T / G->A converted files of read 1 and read 2
#            (used for the progress messages only).
#
# Sets $fh->{fh}, $fh->{last_seq_id}, $fh->{last_line_1} and $fh->{last_line_2}
# for every entry; the latter three are undef for skipped entries and for
# aligner runs that produced no output. Dies if a pipe cannot be opened or if
# neither line of the first pair carries the read 1 tag.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  # directional and PBAT libraries only need 2 of the 4 possible alignment processes
  my $two_instances_only = ($directional or $pbat);

  if ($directional){
    warn "Input file is $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  elsif ($pbat){
    warn "Input file is $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ; PBAT-Seq)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 and $G_to_A_infile_1 and $C_to_T_infile_2 (non-directional; FastQ)\n";
  }

  if ($two_instances_only){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # skipping unwanted filehandles
    if ($two_instances_only and not $fh->{inputfile_1}){
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    ### restricting Bowtie to the strand that makes sense for this read/genome combination
    my $reports_forward_only = ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome');
    my $bt_options = $bowtie_options;
    $bt_options .= $reports_forward_only ? ' --norc' : ' --nofw';

    if ($gzip){
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1}, with the options: $bt_options)\n";
      my $command = "gunzip -c ${temp_dir}$fh->{inputfile_1} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} --12 - |";
      open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";
    }
    else{
      warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from ${temp_dir}$fh->{inputfile_1} and ${temp_dir}$fh->{inputfile_2}, with the options: $bt_options))\n";
      sleep(5);
      my $command = "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |";
      open ($fh->{fh},$command) or die "Can't open pipe to bowtie: $!";
    }

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # without a complete first pair everything is initialised as undefined
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      @{$fh}{qw(last_seq_id last_line_1 last_line_2)} = (undef, undef, undef);
      next;
    }

    chomp $line_1;
    chomp $line_2;

    # the sequence identifier is the first tab-separated field of an output line
    my $id_1 = (split(/\t/,$line_1))[0];
    my $id_2 = (split(/\t/,$line_2))[0];

    ### Bowtie reports the mate with the smaller chromosomal position first, so read 1 may be
    ### on either line. The ID of read 1, stripped of its /1 tag, becomes last_seq_id
    if ($id_1 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s{/1$}{}){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # read 1 or read 2
    $fh->{last_line_2} = $line_2; # read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
7071 | |
7072 ### Bowtie 2 | PAIRED-END | FASTQ | |
7073 | |
# Starts one Bowtie 2 paired-end alignment (FastQ input) for every wanted
# entry of @fhs and primes each entry with the first read pair of the output.
#
# Arguments: the names of the C->T and G->A converted files of read 1 and
#            read 2 (used for the progress messages only; the aligned files
#            are $fh->{inputfile_1} and $fh->{inputfile_2}).
#
# For every entry this sets $fh->{last_seq_id}, $fh->{last_line_1} and
# $fh->{last_line_2} (undef for skipped entries or if there was no output),
# and for processed entries $fh->{fh}, the read pipe from Bowtie 2.
#
# Dies if the pipe cannot be opened, or if neither line of the first pair
# carries the read 1 tag.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  if ($directional){
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  elsif ($pbat){
    warn "Input files are $G_to_A_infile_1 and $C_to_T_infile_2 (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # skipping unwanted filehandles
    if (($directional or $pbat) and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    ### ensuring the alignments are only reported in a sensible manner
    my $bt2_options = $bowtie_options;
    $bt2_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip the header lines (they start with @).
    ### A lexical variable is used here so that the caller's $_ is left untouched
    my $line_1;
    while ($line_1 = $fh->{fh}->getline()){
      last unless ($line_1 =~ /^\@/);
    }
    my $line_2 = $fh->{fh}->getline();

    # no (complete) pair in the output: initialise everything as undefined
    unless ($line_1 and $line_2){
      warn "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    chomp $line_1;
    chomp $line_2;

    my $id_1 = (split(/\t/,$line_1))[0]; # sequence identifier of the first output line
    my $id_2 = (split(/\t/,$line_2))[0]; # sequence identifier of the second output line

    ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
    ### We thus identify which line belongs to read 1 and store its ID (without the /1 tag) as last_seq_id.
    ### (Bowtie 2 clips off /1 or /2 endings itself, which is why /1/1 or /2/2 was added to start with)
    if ($id_1 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $fh->{last_seq_id} = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }

    $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
    $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
7166 | |
7167 ##################################################################################################################################################### | |
7168 | |
7169 ### Bowtie 1 (default) | SINGLE-END | FASTA | |
# Starts one Bowtie single-end alignment (FastA input) for every entry of
# @fhs and primes each entry with the first line of the aligner output.
#
# Arguments: the names of the C->T and G->A converted read files (used for
#            the progress messages only; the aligned file is $fh->{inputfile}).
#
# For every entry this sets
#   $fh->{fh}           read pipe from the Bowtie process
#   $fh->{last_seq_id}  first tab-separated field of the first output line
#   $fh->{last_line}    the first output line (without its line ending)
# (the last two are undef if Bowtie produced no output).
#
# Dies if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    if ($gzip){
      open ($fh->{fh},"gunzip -c $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # the first output line is kept in a lexical so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if ($first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # the first element of the bowtie output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
7223 | |
7224 ### Bowtie 2 | SINGLE-END | FASTA | |
# Starts one Bowtie 2 single-end alignment (FastA input) for every entry of
# @fhs and primes each entry with the first non-header line of the output.
#
# Arguments: the names of the C->T and G->A converted read files (used for
#            the progress messages only; the aligned file is $fh->{inputfile}).
#
# For every entry this sets
#   $fh->{fh}           read pipe from the Bowtie 2 process
#   $fh->{last_seq_id}  first tab-separated field of the first non-header line
#   $fh->{last_line}    the first non-header line (without its line ending)
# (the last two are undef if there was no such line).
#
# Dies if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    warn "Input file is $C_to_T_infile (FastA)\n";
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $bt2_options = $bowtie_options;
    $bt2_options .= ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie 2: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip the header lines (they start with @).
    ### A lexical variable is used here so that the caller's $_ is left untouched
    my $first_line;
    while ($first_line = $fh->{fh}->getline()){
      last unless ($first_line =~ /^\@/);
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if ($first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
7283 | |
7284 | |
7285 ### Bowtie 1 (default) | SINGLE-END | FASTQ | |
# Starts one Bowtie single-end alignment (FastQ input) for every entry of
# @fhs and primes each entry with the first line of the aligner output.
#
# Arguments: the names of the C->T and G->A converted read files (used for
#            the progress messages only; the aligned file is $fh->{inputfile}).
#
# For every entry this sets
#   $fh->{fh}           read pipe from the Bowtie process
#   $fh->{last_seq_id}  first tab-separated field of the first output line
#   $fh->{last_line}    the first output line (without its line ending)
# (the last two are undef if Bowtie produced no output).
#
# Dies if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n";
  }
  elsif($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";
    sleep (5);

    # the commands have to be run with $bt_options (i.e. including --norc/--nofw), exactly as
    # announced in the message above and as done for FastA input
    if ($gzip){
      open ($fh->{fh},"gunzip -c $temp_dir$fh->{inputfile} | $path_to_bowtie $bt_options $fh->{bisulfiteIndex} - |") or die "Can't open pipe to bowtie: $!";
    }
    else{
      open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!"; # command for uncompressed data
    }

    # the first output line is kept in a lexical so that the caller's $_ is left untouched
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if ($first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
7344 | |
7345 ### Bowtie 2 | SINGLE-END | FASTQ | |
# Starts one Bowtie 2 single-end alignment (FastQ input) for every entry of
# @fhs and primes each entry with the first non-header line of the output.
#
# Arguments: the names of the C->T and G->A converted read files (used for
#            the progress messages only; the aligned file is $fh->{inputfile}).
#
# For every entry this sets
#   $fh->{fh}           read pipe from the Bowtie 2 process
#   $fh->{last_seq_id}  first tab-separated field of the first non-header line
#   $fh->{last_line}    the first non-header line (without its line ending)
# (the last two are undef if there was no such line).
#
# Dies if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {

  my ($C_to_T_infile,$G_to_A_infile) = @_;
  if ($directional){
    warn "Input file is $C_to_T_infile (FastQ)\n\n";
  }
  elsif ($pbat){
    warn "Input file is $G_to_A_infile (FastQ)\n\n";
  }
  else{
    warn "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  }

  if ($directional or $pbat){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### ensuring the alignments are only reported in a sensible manner
    my $bt2_options = $bowtie_options;
    $bt2_options .= ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome') ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $bt2_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $bt2_options -x $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 outputs SAM format, so we need to skip the header lines (they start with @).
    ### A lexical variable is used here so that the caller's $_ is left untouched
    my $first_line;
    while ($first_line = $fh->{fh}->getline()){
      last unless ($first_line =~ /^\@/);
    }

    # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
    if ($first_line) {
      chomp $first_line;
      $fh->{last_seq_id} = (split(/\t/,$first_line))[0]; # the first element of the Bowtie 2 output (= the sequence identifier)
      $fh->{last_line}   = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
    else {
      warn "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
7410 | |
7411 ########################################################################################################################################### | |
7412 | |
# Resets the file-level %counting hash to all zeroes and rebuilds the
# file-level @fhs list for the library type in use.
#
# Argument: the input file name; a comma in it marks paired-end input.
#
# @fhs afterwards holds one hash per alignment process with the keys name,
# strand_identity, bisulfiteIndex, seen and wrong_strand:
#   --directional, single-end : CTreadCTgenome, CTreadGAgenome
#   --pbat, single-end        : GAreadCTgenome, GAreadGAgenome
#   everything else           : all four read/genome combinations
sub reset_counters_and_fhs{
  my $filename = shift;

  # Strand counters:
  #   CT_CT_count     CT read/CT genome, original top strand
  #   CT_GA_count     CT read/GA genome, original bottom strand
  #   GA_CT_count     GA read/CT genome, complementary to original top strand
  #   GA_GA_count     GA read/GA genome, complementary to original bottom strand
  #   CT_GA_CT_count  CT read1/GA read2/CT genome, original top strand
  #   GA_CT_GA_count  GA read1/CT read2/GA genome, complementary to original bottom strand
  #   GA_CT_CT_count  GA read1/CT read2/CT genome, complementary to original top strand
  #   CT_GA_GA_count  CT read1/GA read2/GA genome, original bottom strand
  # alignments_rejected_count is only relevant if --directional was specified
  my @counter_names = qw(
    total_meCHH_count
    total_meCHG_count
    total_meCpG_count
    total_meC_unknown_count
    total_unmethylated_CHH_count
    total_unmethylated_CHG_count
    total_unmethylated_CpG_count
    total_unmethylated_C_unknown_count
    sequences_count
    no_single_alignment_found
    unsuitable_sequence_count
    genomic_sequence_could_not_be_extracted_count
    unique_best_alignment_count
    low_complexity_alignments_overruled_count
    CT_CT_count
    CT_GA_count
    GA_CT_count
    GA_GA_count
    CT_GA_CT_count
    GA_CT_GA_count
    GA_CT_CT_count
    CT_GA_GA_count
    alignments_rejected_count
  );
  %counting = map { ($_ => 0) } @counter_names;

  # strand identity and bisulfite index belonging to each read/genome combination
  my %settings_for = (
    CTreadCTgenome => [ 'con ori forward',       $CT_index_basename ],
    CTreadGAgenome => [ 'con ori reverse',       $GA_index_basename ],
    GAreadCTgenome => [ 'compl ori con forward', $CT_index_basename ],
    GAreadGAgenome => [ 'compl ori con reverse', $GA_index_basename ],
  );
  my @all_names = qw(CTreadCTgenome CTreadGAgenome GAreadCTgenome GAreadGAgenome);

  # selecting the combinations needed for this library type
  my @wanted_names = @all_names;
  if ($directional){
    unless ($filename =~ /,/){ # single-end files
      @wanted_names = @all_names[0,1];
    }
  }
  elsif ($pbat){
    unless ($filename =~ /,/){ # single-end files
      @wanted_names = @all_names[2,3];
    }
  }

  @fhs = map {
    my ($strand_identity,$index_basename) = @{ $settings_for{$_} };
    {
      name            => $_,
      strand_identity => $strand_identity,
      bisulfiteIndex  => $index_basename,
      seen            => 0,
      wrong_strand    => 0,
    };
  } @wanted_names;
}
7562 | |
7563 | |
7564 sub process_command_line{ | |
7565 my @bowtie_options; | |
7566 my $help; | |
7567 my $mates1; | |
7568 my $mates2; | |
7569 my $path_to_bowtie; | |
7570 my $fastq; | |
7571 my $fasta; | |
7572 my $skip; | |
7573 my $qupto; | |
7574 my $phred64; | |
7575 my $phred33; | |
7576 my $solexa; | |
7577 my $mismatches; | |
7578 my $seed_length; | |
7579 my $best; | |
7580 my $sequence_format; | |
7581 my $version; | |
7582 my $quiet; | |
7583 my $chunk; | |
7584 my $non_directional; | |
7585 my $ceiling; | |
7586 my $maxins; | |
7587 my $minins; | |
7588 my $unmapped; | |
7589 my $multi_map; | |
7590 my $output_dir; | |
7591 my $bowtie2; | |
7592 my $vanilla; | |
7593 my $sam_no_hd; | |
7594 my $seed_extension_fails; | |
7595 my $reseed_repetitive_seeds; | |
7596 my $most_valid_alignments; | |
7597 my $score_min; | |
7598 my $parallel; | |
7599 my $temp_dir; | |
7600 my $rdg; | |
7601 my $rfg; | |
7602 my $non_bs_mm; | |
7603 my $samtools_path; | |
7604 my $bam; | |
7605 my $gzip; | |
7606 my $pbat; | |
7607 my $prefix; | |
7608 my $old_flag; | |
7609 my $basename; | |
7610 my $sam; | |
7611 my $multicore; | |
7612 my $bowtie1; | |
7613 my $rg_tag; | |
7614 my $rg_id; | |
7615 my $rg_sample; | |
7616 my $genome_folder; | |
7617 my $singles; | |
7618 my $ambig_bam; | |
7619 my $cram; | |
7620 my $cram_ref; | |
7621 my $nucleotide_coverage; | |
7622 my $dovetail; | |
7623 | |
7624 my $command_line = GetOptions ('help|man' => \$help, | |
7625 '1=s' => \$mates1, | |
7626 '2=s' => \$mates2, | |
7627 'path_to_bowtie=s' => \$path_to_bowtie, | |
7628 'genome_folder=s' => \$genome_folder, | |
7629 'f|fasta' => \$fasta, | |
7630 'q|fastq' => \$fastq, | |
7631 's|skip=i' => \$skip, | |
7632 'u|upto=i' => \$qupto, | |
7633 'phred33-quals' => \$phred33, | |
7634 'phred64-quals|solexa1' => \$phred64, | |
7635 'solexa-quals' => \$solexa, | |
7636 'n|seedmms=i' => \$mismatches, | |
7637 'l|seedlen=i' => \$seed_length, | |
7638 'no_best' => \$best, | |
7639 'version' => \$version, | |
7640 'quiet' => \$quiet, | |
7641 'chunkmbs=i' => \$chunk, | |
7642 'non_directional' => \$non_directional, | |
7643 'I|minins=i' => \$minins, | |
7644 'X|maxins=i' => \$maxins, | |
7645 'e|maqerr=i' => \$ceiling, | |
7646 'un|unmapped' => \$unmapped, | |
7647 'ambiguous' => \$multi_map, | |
7648 'o|output_dir=s' => \$output_dir, | |
7649 'bowtie2' => \$bowtie2, | |
7650 'bowtie1' => \$bowtie1, | |
7651 'vanilla' => \$vanilla, | |
7652 'sam-no-hd' => \$sam_no_hd, | |
7653 'D=i' => \$seed_extension_fails, | |
7654 'R=i' => \$reseed_repetitive_seeds, | |
7655 'score_min=s' => \$score_min, | |
7656 'most_valid_alignments=i' => \$most_valid_alignments, | |
7657 'p=i' => \$parallel, | |
7658 'temp_dir=s' => \$temp_dir, | |
7659 'rdg=s' => \$rdg, | |
7660 'rfg=s' => \$rfg, | |
7661 'non_bs_mm' => \$non_bs_mm, | |
7662 'samtools_path=s' => \$samtools_path, | |
7663 'bam' => \$bam, | |
7664 'gzip' => \$gzip, | |
7665 'pbat' => \$pbat, | |
7666 'prefix=s' => \$prefix, | |
7667 'old_flag' => \$old_flag, | |
7668 'B|basename=s' => \$basename, | |
7669 'sam' => \$sam, | |
7670 'multicore=i' => \$multicore, | |
7671 'rg_tag' => \$rg_tag, | |
7672 'rg_id=s' => \$rg_id, | |
7673 'rg_sample=s' => \$rg_sample, | |
7674 'se|single_end=s' => \$singles, | |
7675 'ambig_bam' => \$ambig_bam, | |
7676 'cram' => \$cram, | |
7677 'cram_ref=s' => \$cram_ref, | |
7678 'nucleotide_coverage' => \$nucleotide_coverage, | |
7679 'dovetail' => \$dovetail, | |
7680 ); | |
7681 | |
7682 | |
7683 ### EXIT ON ERROR if there were errors with any of the supplied options | |
7684 unless ($command_line){ | |
7685 die "Please respecify command line options\n"; | |
7686 } | |
7687 ### HELPFILE | |
7688 if ($help){ | |
7689 print_helpfile(); | |
7690 exit; | |
7691 } | |
7692 if ($version){ | |
7693 print << "VERSION"; | |
7694 | |
7695 | |
7696 Bismark - Bisulfite Mapper and Methylation Caller. | |
7697 | |
7698 Bismark Version: $bismark_version | |
7699 Copyright 2010-15 Felix Krueger, Babraham Bioinformatics | |
7700 www.bioinformatics.babraham.ac.uk/projects/ | |
7701 | |
7702 | |
7703 VERSION | |
7704 exit; | |
7705 } | |
7706 | |
7707 | |
7708 ########################## | |
7709 ### PROCESSING OPTIONS ### | |
7710 ########################## | |
7711 | |
7712 if ($bowtie1){ | |
7713 $bowtie2 = 0; | |
7714 } | |
7715 else{ # Bowtie 2 is now the default mode (as of 27 July 2015) | |
7716 $bowtie2 = 1; | |
7717 } | |
7718 | |
7719 unless ($sam_no_hd){ | |
7720 $sam_no_hd =0; | |
7721 } | |
7722 | |
7723 ### PATH TO BOWTIE | |
7724 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH | |
7725 if ($path_to_bowtie){ | |
7726 unless ($path_to_bowtie =~ /\/$/){ | |
7727 $path_to_bowtie =~ s/$/\//; | |
7728 } | |
7729 if (-d $path_to_bowtie){ | |
7730 if ($bowtie2){ | |
7731 $path_to_bowtie = "${path_to_bowtie}bowtie2"; | |
7732 } | |
7733 else{ | |
7734 $path_to_bowtie = "${path_to_bowtie}bowtie"; | |
7735 } | |
7736 } | |
7737 else{ | |
7738 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n"; | |
7739 } | |
7740 } | |
7741 else{ | |
7742 if ($bowtie2){ | |
7743 $path_to_bowtie = 'bowtie2'; | |
7744 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; } | |
7745 else{ | |
7746 $path_to_bowtie = 'bowtie'; | |
7747 warn "Path to Bowtie specified as: $path_to_bowtie\n"; | |
7748 } | |
7749 } | |
7750 | |
7751 | |
7752 if ($sam){ | |
7753 warn "Output format manually set as SAM\n"; | |
7754 } | |
7755 elsif($cram){ | |
7756 warn "Output format set to CRAM\n"; | |
7757 if (defined $cram_ref){ | |
7758 warn "CRAM reference given as: '$cram_ref'\n\n"; | |
7759 unless (-e $cram_ref){ | |
7760 die "There is a problem with the CRAM reference '$cram_ref': $!\n\n"; | |
7761 } | |
7762 | |
7763 # determining full path information for the cram reference | |
7764 if ($cram_ref =~/\//){ | |
7765 if (chdir $cram_ref){ | |
7766 my $absolute_cram_ref_folder = getcwd; ## making the genome folder path absolute | |
7767 unless ($absolute_cram_ref_folder =~/\/$/){ | |
7768 $absolute_cram_ref_folder =~ s/$/\//; | |
7769 } | |
7770 } | |
7771 } | |
7772 } | |
7773 else{ | |
7774 warn "CRAM reference not specified explicitely, regenerating from FastA reference files\n\n"; | |
7775 } | |
7776 } | |
7777 else{ | |
7778 $bam = 1; | |
7779 warn "Output format is BAM (default)\n"; | |
7780 } | |
7781 | |
7782 ### OUTPUT REQUESTED AS BAM FILE (default) | |
7783 if ($bam or $cram){ | |
7784 if ($vanilla){ | |
7785 die "Specifying BAM output is not compatible with \"--vanilla\" format. Please respecify\n\n"; | |
7786 } | |
7787 | |
7788 ### PATH TO SAMTOOLS | |
7789 if (defined $samtools_path){ | |
7790 # if Samtools was specified as full command | |
7791 if ($samtools_path =~ /samtools$/){ | |
7792 if (-e $samtools_path){ | |
7793 # Samtools executable found | |
7794 } | |
7795 else{ | |
7796 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
7797 } | |
7798 } | |
7799 else{ | |
7800 unless ($samtools_path =~ /\/$/){ | |
7801 $samtools_path =~ s/$/\//; | |
7802 } | |
7803 $samtools_path .= 'samtools'; | |
7804 if (-e $samtools_path){ | |
7805 # Samtools executable found | |
7806 } | |
7807 else{ | |
7808 die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n"; | |
7809 } | |
7810 } | |
7811 | |
7812 warn "Alignments will be written out in BAM format. Samtools path provided as: '$samtools_path'\n"; | |
7813 $bam = 1; | |
7814 } | |
7815 # Check whether Samtools is in the PATH if no path was supplied by the user | |
7816 else{ | |
7817 if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if samtools is in the PATH | |
7818 $samtools_path = `which samtools`; | |
7819 chomp $samtools_path; | |
7820 warn "Alignments will be written out in BAM format. Samtools found here: '$samtools_path'\n"; sleep(1); | |
7821 $bam = 1; | |
7822 } | |
7823 } | |
7824 | |
7825 unless (defined $samtools_path){ | |
7826 $bam = 2; | |
7827 warn "Did not find Samtools on the system. Alignments will be compressed with GZIP instead (.sam.gz)\n"; | |
7828 } | |
7829 sleep (1); | |
7830 } | |
7831 | |
7832 ### OPTION AMBIGUOUS BAM | |
7833 if ($ambig_bam){ | |
7834 unless ($bowtie2){ | |
7835 die "The option --ambig_bam is only available for Bowtie2 alignments\n"; | |
7836 } | |
7837 } | |
7838 | |
7839 | |
7840 #################################### | |
7841 ### PROCESSING ARGUMENTS | |
7842 | |
7843 ### GENOME FOLDER | |
7844 if (defined $genome_folder){ # 25 11 2015 The genome folder may now also be defined with the option --genome_folder | |
7845 # warn "Genome folder specified with --genome_folder $genome_folder\n"; | |
7846 } | |
7847 else{ | |
7848 $genome_folder = shift @ARGV; # mandatory | |
7849 } | |
7850 | |
7851 unless ($genome_folder){ | |
7852 warn "Genome folder was not specified!\n"; | |
7853 print_helpfile(); | |
7854 exit; | |
7855 } | |
7856 | |
7857 ### checking that the genome folder, all subfolders and the required bowtie index files exist | |
7858 unless ($genome_folder =~/\/$/){ | |
7859 $genome_folder =~ s/$/\//; | |
7860 } | |
7861 | |
7862 if (chdir $genome_folder){ | |
7863 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute | |
7864 unless ($absolute_genome_folder =~/\/$/){ | |
7865 $absolute_genome_folder =~ s/$/\//; | |
7866 } | |
7867 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n"; | |
7868 $genome_folder = $absolute_genome_folder; | |
7869 } | |
7870 else{ | |
7871 die "Failed to move to $genome_folder: $!\nUSAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n"; | |
7872 } | |
7873 | |
7874 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/"; | |
7875 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/"; | |
7876 | |
7877 my $bt2_small_index_present = 1; | |
7878 my $bt2_large_index_present = 1; | |
7879 | |
7880 my $bt_small_index_present = 1; | |
7881 my $bt_large_index_present = 1; | |
7882 | |
7883 if ($bowtie2){ ### Bowtie 2 | |
7884 | |
7885 ### Checking for small indexes first (ending in .bt2) | |
7886 | |
7887 # checking the integrity of $CT_dir | |
7888 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7889 | |
7890 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2'); | |
7891 foreach my $file(@CT_bowtie_index){ | |
7892 unless (-f $file){ | |
7893 warn "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; | |
7894 $bt2_small_index_present = 0; | |
7895 } | |
7896 } | |
7897 # checking the integrity of $GA_dir | |
7898 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7899 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2'); | |
7900 | |
7901 foreach my $file(@GA_bowtie_index){ | |
7902 unless (-f $file){ | |
7903 warn "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; | |
7904 $bt2_small_index_present = 0; | |
7905 } | |
7906 } | |
7907 | |
7908 ### Using the small index preferentially | |
7909 if ($bt2_small_index_present){ | |
7910 $bt2_large_index_present = 0; | |
7911 } | |
7912 else{ # only checking for large indexes if the 'normal' one can't be found | |
7913 warn "\nCouldn't find a traditional small Bowtie 2 index for the genome specified (ending in .bt2). Now searching for a large index instead (64-bit index ending in .bt2l)...\n"; | |
7914 | |
7915 ### If no small indexes were found we look for large indexes (64-bit indexes, ending in .bt2l) | |
7916 | |
7917 # checking the integrity of $CT_dir | |
7918 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7919 | |
7920 @CT_bowtie_index = ('BS_CT.1.bt2l','BS_CT.2.bt2l','BS_CT.3.bt2l','BS_CT.4.bt2l','BS_CT.rev.1.bt2l','BS_CT.rev.2.bt2l'); | |
7921 foreach my $file(@CT_bowtie_index){ | |
7922 unless (-f $file){ | |
7923 die "The Bowtie 2 index of the C->T converted genome seems to be faulty or non-existant ('$file'). Please run the bismark_genome_preparation before running Bismark\n"; | |
7924 $bt2_large_index_present = 0; } | |
7925 } | |
7926 | |
7927 ### checking the integrity of $GA_dir | |
7928 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7929 @GA_bowtie_index = ('BS_GA.1.bt2l','BS_GA.2.bt2l','BS_GA.3.bt2l','BS_GA.4.bt2l','BS_GA.rev.1.bt2l','BS_GA.rev.2.bt2l'); | |
7930 | |
7931 foreach my $file(@GA_bowtie_index){ | |
7932 unless (-f $file){ | |
7933 die "The Bowtie 2 index of the G->A converted genome seems to be faulty or non-existant ('$file'). Please run bismark_genome_preparation before running Bismark\n"; | |
7934 $bt2_large_index_present = 0; | |
7935 } | |
7936 } | |
7937 | |
7938 if ($bt2_large_index_present){ | |
7939 warn "64-bit large genome Bowtie 2 index found...\n"; | |
7940 } | |
7941 else{ | |
7942 die "Failed to detect either a standard (.bt2) or 64-bit (.bt2l) Bowtie 2 index for the genome specified. Please run the bismark_genome_preparation before launching Bismark\n\n"; | |
7943 } | |
7944 } | |
7945 | |
7946 } | |
7947 | |
7948 else{ ### Bowtie 1 | |
7949 ### Checking for small indexes first (ending in .ebwt) | |
7950 ### checking the integrity of $CT_dir | |
7951 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7952 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt'); | |
7953 foreach my $file(@CT_bowtie_index){ | |
7954 unless (-f $file){ | |
7955 warn "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; | |
7956 $bt_small_index_present = 0; | |
7957 } | |
7958 } | |
7959 | |
7960 ### checking the integrity of $GA_dir | |
7961 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7962 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt'); | |
7963 foreach my $file(@GA_bowtie_index){ | |
7964 unless (-f $file){ | |
7965 warn "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; | |
7966 $bt_small_index_present = 0; | |
7967 } | |
7968 } | |
7969 | |
7970 ### Using the small index preferentially | |
7971 if ($bt_small_index_present){ | |
7972 $bt_large_index_present = 0; | |
7973 } | |
7974 else{ # only checking for large indexes if the 'normal' one can't be found | |
7975 warn "\nCouldn't find a traditional small Bowtie index for the genome specified (ending in .ebwt). Now searching for a large index instead (64-bit index ending in .ebwtl)...\n"; | |
7976 | |
7977 ### If no small indexes were found we look for large indexes (64-bit indexes, ending in .ebwtl) | |
7978 | |
7979 ### checking the integrity of $CT_dir | |
7980 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n"; | |
7981 my @CT_bowtie_index = ('BS_CT.1.ebwtl','BS_CT.2.ebwtl','BS_CT.3.ebwtl','BS_CT.4.ebwtl','BS_CT.rev.1.ebwtl','BS_CT.rev.2.ebwtl'); | |
7982 foreach my $file(@CT_bowtie_index){ | |
7983 unless (-f $file){ | |
7984 warn "The Bowtie index of the C->T converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; | |
7985 $bt_large_index_present = 0; | |
7986 } | |
7987 } | |
7988 | |
7989 ### checking the integrity of $GA_dir | |
7990 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n"; | |
7991 my @GA_bowtie_index = ('BS_GA.1.ebwtl','BS_GA.2.ebwtl','BS_GA.3.ebwtl','BS_GA.4.ebwtl','BS_GA.rev.1.ebwtl','BS_GA.rev.2.ebwtl'); | |
7992 foreach my $file(@GA_bowtie_index){ | |
7993 unless (-f $file){ | |
7994 warn "The Bowtie index of the G->A converted genome seems to be faulty ($file doesn't exist). Please run bismark_genome_preparation --bowtie1 before running Bismark.\n"; | |
7995 $bt_large_index_present = 0; | |
7996 } | |
7997 } | |
7998 | |
7999 if ($bt_large_index_present){ | |
8000 warn "64-bit large genome Bowtie index found...\n"; | |
8001 } | |
8002 else{ | |
8003 die "Failed to detect either a standard (.ebwt) or 64-bit (.ebwtl) Bowtie index for the genome specified. Please run the bismark_genome_preparation --bowtie1 before launching Bismark\n\n"; | |
8004 } | |
8005 } | |
8006 | |
8007 } | |
8008 | |
8009 my $CT_index_basename = "${CT_dir}BS_CT"; | |
8010 my $GA_index_basename = "${GA_dir}BS_GA"; | |
8011 | |
8012 ### INPUT OPTIONS | |
8013 | |
8014 ### SEQUENCE FILE FORMAT | |
8015 ### exits if both fastA and FastQ were specified | |
8016 if ($fasta and $fastq){ | |
8017 die "Only one sequence filetype can be specified (fastA or fastQ)\n"; | |
8018 } | |
8019 | |
8020 ### unless fastA is specified explicitly, fastQ sequence format is expected by default | |
8021 if ($fasta){ | |
8022 print "FastA format specified\n"; | |
8023 $sequence_format = 'FASTA'; | |
8024 push @bowtie_options, '-f'; | |
8025 } | |
8026 elsif ($fastq){ | |
8027 print "FastQ format specified\n"; | |
8028 $sequence_format = 'FASTQ'; | |
8029 push @bowtie_options, '-q'; | |
8030 } | |
8031 else{ | |
8032 $fastq = 1; | |
8033 print "FastQ format assumed (by default)\n"; | |
8034 $sequence_format = 'FASTQ'; | |
8035 push @bowtie_options, '-q'; | |
8036 } | |
8037 | |
8038 ### SKIP | |
8039 if ($skip){ | |
8040 warn "Skipping the first $skip reads from the input file\n"; | |
8041 # push @bowtie_options,"-s $skip"; | |
8042 } | |
8043 | |
8044 ### UPTO | |
8045 if ($qupto){ | |
8046 warn "Processing sequences up to read no. $qupto from the input file\n"; | |
8047 if ($bowtie2){ | |
8048 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2 | |
8049 } | |
8050 else{ | |
8051 # push @bowtie_options,"--qupto $qupto"; | |
8052 } | |
8053 } | |
8054 | |
8055 ### QUALITY VALUES | |
8056 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){ | |
8057 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)"; | |
8058 } | |
8059 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2. | |
8060 # Phred quality values work only when -q is specified | |
8061 unless ($fastq){ | |
8062 die "Phred quality values works only when -q (FASTQ) is specified\n"; | |
8063 } | |
8064 if ($bowtie2){ | |
8065 push @bowtie_options,"--phred33"; | |
8066 } | |
8067 else{ | |
8068 push @bowtie_options,"--phred33-quals"; | |
8069 } | |
8070 } | |
8071 if ($phred64){ | |
8072 # Phred quality values work only when -q is specified | |
8073 unless ($fastq){ | |
8074 die "Phred quality values work only when -q (FASTQ) is specified\n"; | |
8075 } | |
8076 if ($bowtie2){ | |
8077 push @bowtie_options,"--phred64"; | |
8078 } | |
8079 else{ | |
8080 push @bowtie_options,"--phred64-quals"; | |
8081 } | |
8082 } | |
8083 else{ | |
8084 $phred64 = 0; | |
8085 } | |
8086 | |
8087 if ($solexa){ | |
8088 if ($bowtie2){ | |
8089 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n"; | |
8090 } | |
8091 # Solexa to Phred value conversion works only when -q is specified | |
8092 unless ($fastq){ | |
8093 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n"; | |
8094 } | |
8095 push @bowtie_options,"--solexa-quals"; | |
8096 } | |
8097 else{ | |
8098 $solexa = 0; | |
8099 } | |
8100 | |
8101 ### ALIGNMENT OPTIONS | |
8102 | |
8103 ### MISMATCHES | |
8104 if (defined $mismatches){ | |
8105 if ($bowtie2){ | |
8106 if ($mismatches == 0 or $mismatches == 1){ | |
8107 push @bowtie_options,"-N $mismatches"; | |
8108 } | |
8109 else{ | |
8110 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n"; | |
8111 } | |
8112 } | |
8113 else{ | |
8114 if ($mismatches >= 0 and $mismatches <= 3){ | |
8115 push @bowtie_options,"-n $mismatches"; | |
8116 } | |
8117 else{ | |
8118 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n"; | |
8119 } | |
8120 } | |
8121 } | |
8122 else{ | |
8123 unless ($bowtie2){ | |
8124 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2 | |
8125 } | |
8126 } | |
8127 | |
8128 ### SEED LENGTH | |
8129 if (defined $seed_length){ | |
8130 if ($bowtie2){ | |
8131 push @bowtie_options,"-L $seed_length"; | |
8132 } | |
8133 else{ | |
8134 push @bowtie_options,"-l $seed_length"; | |
8135 } | |
8136 } | |
8137 | |
8138 ### MISMATCH CEILING | |
8139 if (defined $ceiling){ | |
8140 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2); | |
8141 push @bowtie_options,"-e $ceiling"; | |
8142 } | |
8143 | |
8144 | |
8145 ### BOWTIE 2 EFFORT OPTIONS | |
8146 | |
8147 ### CONSECUTIVE SEED EXTENSION FAILS | |
8148 if (defined $seed_extension_fails){ | |
8149 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
8150 push @bowtie_options,"-D $seed_extension_fails"; | |
8151 } | |
8152 | |
8153 ### RE-SEEDING REPETITIVE SEEDS | |
8154 if (defined $reseed_repetitive_seeds){ | |
8155 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
8156 push @bowtie_options,"-R $reseed_repetitive_seeds"; | |
8157 } | |
8158 | |
8159 | |
8160 ### BOWTIE 2 SCORING OPTIONS | |
8161 | |
8162 my ($score_min_intercept, $score_min_slope); | |
8163 | |
8164 if ($score_min){ | |
8165 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
8166 | |
8167 unless ($score_min =~ /^L,(.+),(.+)$/){ | |
8168 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
8169 } | |
8170 ($score_min_intercept, $score_min_slope) = ($1, $2); | |
8171 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 | |
8172 } | |
8173 else{ | |
8174 if ($bowtie2){ | |
8175 ($score_min_intercept, $score_min_slope) = (0, -0.2); | |
8176 push @bowtie_options,"--score-min L,$score_min_intercept,$score_min_slope"; # default setting, more stringent than normal Bowtie2 | |
8177 } | |
8178 } | |
8179 | |
8180 ### BOWTIE 2 READ GAP OPTIONS | |
8181 my ($insertion_open,$insertion_extend,$deletion_open,$deletion_extend); | |
8182 | |
8183 if ($rdg){ | |
8184 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
8185 if ($rdg =~ /^(\d+),(\d+)$/){ | |
8186 $deletion_open = $1; | |
8187 $deletion_extend = $2; | |
8188 } | |
8189 else{ | |
8190 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
8191 } | |
8192 push @bowtie_options,"--rdg $rdg"; | |
8193 } | |
8194 else{ | |
8195 $deletion_open = 5; | |
8196 $deletion_extend = 3; | |
8197 } | |
8198 | |
8199 ### BOWTIE 2 REFERENCE GAP OPTIONS | |
8200 if ($rfg){ | |
8201 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2); | |
8202 if ($rfg =~ /^(\d+),(\d+)$/){ | |
8203 $insertion_open = $1; | |
8204 $insertion_extend = $2; | |
8205 } | |
8206 else{ | |
8207 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n"; | |
8208 } | |
8209 push @bowtie_options,"--rfg $rfg"; | |
8210 } | |
8211 else{ | |
8212 $insertion_open = 5; | |
8213 $insertion_extend = 3; | |
8214 } | |
8215 | |
8216 | |
8217 ### BOWTIE 2 PARALLELIZATION OPTIONS | |
8218 if (defined $parallel){ | |
8219 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2); | |
8220 } | |
8221 if ($bowtie2){ | |
8222 if ($parallel){ | |
8223 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1); | |
8224 if ($parallel > 4){ | |
8225 warn "Attention: using more than 4 cores per alignment thread has been reported to have diminishing returns. If possible try to limit -p to a value of 4\n"; sleep(2); | |
8226 } | |
8227 push @bowtie_options,"-p $parallel"; | |
8228 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work. | |
8229 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n"; | |
8230 sleep (2); | |
8231 } | |
8232 } | |
8233 | |
8234 ### REPORTING OPTIONS | |
8235 | |
8236 if ($bowtie2){ | |
8237 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default | |
8238 | |
8239 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while | |
8240 if(defined $most_valid_alignments){ | |
8241 | |
8242 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n"; | |
8243 } | |
8244 } | |
8245 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1 | |
8246 push @bowtie_options,'-k 2'; | |
8247 } | |
8248 | |
8249 ### --BEST | |
8250 if ($bowtie2){ | |
8251 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used | |
8252 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n"; | |
8253 } | |
8254 } | |
8255 else{ | |
8256 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process) | |
8257 unless ($best){ | |
8258 push @bowtie_options,'--best'; | |
8259 } | |
8260 } | |
8261 | |
8262 ### VANILLA BISMARK (BOWTIE 1) OUTPUT | |
8263 if ($vanilla){ | |
8264 if ($bowtie2){ | |
8265 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n"; | |
8266 } | |
8267 } | |
8268 else{ | |
8269 $vanilla = 0; | |
8270 } | |
8271 | |
8272 ### PAIRED-END MAPPING | |
8273 if ($mates1){ | |
8274 | |
8275 if (defined $singles){ # if --single_end has been set explicitely | |
8276 die "You cannot set --single_end and supply files in paired-end format (-1 <mates1> -2 <mates2>). Please respecify!\n"; | |
8277 } | |
8278 | |
8279 my @mates1 = (split (/,/,$mates1)); | |
8280 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2); | |
8281 my @mates2 = (split(/,/,$mates2)); | |
8282 unless (scalar @mates1 == scalar @mates2){ | |
8283 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n"; | |
8284 } | |
8285 while (1){ | |
8286 my $mate1 = shift @mates1; | |
8287 my $mate2 = shift @mates2; | |
8288 last unless ($mate1 and $mate2); | |
8289 push @filenames,"$mate1,$mate2"; | |
8290 } | |
8291 if ($bowtie2){ | |
8292 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments | |
8293 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones | |
8294 | |
8295 if ($pbat){ | |
8296 $dovetail = 1; # setting the option $dovetail for PBAT paired-end alignments | |
8297 } | |
8298 | |
8299 if ($dovetail){ | |
8300 if ($old_flag){ | |
8301 die "The option --dovetail may only be specified with the current SAM FLAG values. Please respecify...\n"; | |
8302 } | |
8303 push @bowtie_options,'--dovetail'; ## 07 03 2016 Adding the option --dovetail, mainly for PBAT alignments | |
8304 } | |
8305 } | |
8306 | |
8307 if ($old_flag){ | |
8308 warn "\nUsing FLAG values for paired-end SAM output used up to Bismark v0.8.2. In addition, paired-end sequences will have /1 and /2 appended to their read IDs\n\n" unless($vanilla); | |
8309 sleep(3); | |
8310 } | |
8311 } | |
8312 elsif ($mates2){ | |
8313 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n"; | |
8314 } | |
8315 | |
8316 ### SINGLE-END MAPPING | |
8317 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified | |
8318 | |
8319 unless ($mates1 and $mates2){ | |
8320 if (defined $singles){ # if --single_end has been set explicitely | |
8321 warn "Mapping set to single-end mode (user defined). File names need to be separated by commas [,] or colons [:]! Supplied file names are: $singles\n"; | |
8322 $singles =~ s/:/,/g; # replacing colons (:) with commas | |
8323 } | |
8324 else{ | |
8325 $singles = join (',',@ARGV); | |
8326 unless ($singles){ | |
8327 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n"; | |
8328 } | |
8329 $singles =~ s/\s/,/g; # replacing spaces with commas | |
8330 } | |
8331 | |
8332 @filenames = (split(/,/,$singles)); | |
8333 warn "\nFiles to be analysed:\n"; | |
8334 warn "@filenames\n\n"; | |
8335 sleep (3); | |
8336 } | |
8337 | |
8338 ### MINIMUM INSERT SIZE (PAIRED-END ONLY) | |
8339 if (defined $minins){ | |
8340 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles); | |
8341 push @bowtie_options,"--minins $minins"; | |
8342 } | |
8343 | |
8344 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY) | |
8345 if (defined $maxins){ | |
8346 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles); | |
8347 push @bowtie_options,"--maxins $maxins"; | |
8348 } | |
8349 else{ | |
8350 unless ($singles){ | |
8351 push @bowtie_options,'--maxins 500'; | |
8352 } | |
8353 } | |
8354 | |
8355 ### QUIET prints nothing besides alignments (suppresses warnings) | |
8356 if ($quiet){ | |
8357 push @bowtie_options,'--quiet'; | |
8358 } | |
8359 | |
8360 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments | |
8361 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option | |
8362 if (defined $chunk){ | |
8363 push @bowtie_options,"--chunkmbs $chunk"; | |
8364 } | |
8365 else{ | |
8366 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default) | |
8367 } | |
8368 } | |
8369 | |
8370 | |
8371 ### SUMMARY OF ALL BOWTIE OPTIONS | |
8372 my $bowtie_options = join (' ',@bowtie_options); | |
8373 | |
8374 | |
8375 ### STRAND-SPECIFIC LIBRARIES | |
8376 my $directional; | |
8377 if ($non_directional){ | |
8378 die "A library can only be specified to be either non-directional or a PBAT-Seq library. Please respecify!\n\n" if ($pbat); | |
8379 warn "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported\n"; | |
8380 sleep (1); | |
8381 $directional = 0; | |
8382 } | |
8383 elsif($pbat){ | |
8384 die "The option --pbat is currently not compatible with --gzip. Please run alignments with uncompressed temporary files, i.e. lose the option --gzip\n" if ($gzip); | |
8385 die "The option --pbat is currently only working with FastQ files. Please respecify (i.e. lose the option -f)!\n" if ($fasta); | |
8386 | |
8387 warn "Library was specified as PBAT-Seq (Post-Bisulfite Adapter Tagging), only performing alignments to the complementary strands (CTOT and CTOB)\n"; | |
8388 sleep (1); | |
8389 $directional = 0; | |
8390 } | |
8391 else{ | |
8392 warn "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!)\n"; | |
8393 sleep (1); | |
8394 $directional = 1; # default behaviour | |
8395 } | |
8396 | |
8397 ### UNMAPPED SEQUENCE OUTPUT | |
8398 $unmapped = 0 unless ($unmapped); | |
8399 | |
8400 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT | |
8401 $multi_map = 0 unless ($multi_map); | |
8402 | |
8403 | |
8404 ### OUTPUT DIRECTORY | |
8405 | |
8406 chdir $parent_dir or die "Failed to move back to current working directory\n"; | |
8407 if ($output_dir){ | |
8408 unless ($output_dir =~ /\/$/){ | |
8409 $output_dir =~ s/$/\//; | |
8410 } | |
8411 | |
8412 if (chdir $output_dir){ | |
8413 $output_dir = getcwd; # making the path absolute | |
8414 unless ($output_dir =~ /\/$/){ | |
8415 $output_dir =~ s/$/\//; | |
8416 } | |
8417 } | |
8418 else{ | |
8419 mkdir $output_dir or die "Unable to create directory $output_dir $!\n"; | |
8420 warn "Created output directory $output_dir!\n\n"; | |
8421 chdir $output_dir or die "Failed to move to $output_dir\n"; | |
8422 $output_dir = getcwd; # making the path absolute | |
8423 unless ($output_dir =~ /\/$/){ | |
8424 $output_dir =~ s/$/\//; | |
8425 } | |
8426 } | |
8427 warn "Output will be written into the directory: $output_dir\n"; | |
8428 } | |
8429 else{ | |
8430 $output_dir = ''; | |
8431 } | |
8432 | |
8433 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files | |
8434 | |
8435 chdir $parent_dir or die "Failed to move back to current working directory\n"; | |
8436 if ($temp_dir){ | |
8437 warn "\nUsing temp directory: $temp_dir\n"; | |
8438 unless ($temp_dir =~ /\/$/){ | |
8439 $temp_dir =~ s/$/\//; | |
8440 } | |
8441 | |
8442 if (chdir $temp_dir){ | |
8443 $temp_dir = getcwd; # making the path absolute | |
8444 unless ($temp_dir =~ /\/$/){ | |
8445 $temp_dir =~ s/$/\//; | |
8446 } | |
8447 } | |
8448 else{ | |
8449 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n"; | |
8450 warn "Created temporary directory $temp_dir!\n\n"; | |
8451 chdir $temp_dir or die "Failed to move to $temp_dir\n"; | |
8452 $temp_dir = getcwd; # making the path absolute | |
8453 unless ($temp_dir =~ /\/$/){ | |
8454 $temp_dir =~ s/$/\//; | |
8455 } | |
8456 } | |
8457 warn "Temporary files will be written into the directory: $temp_dir\n"; | |
8458 } | |
8459 else{ | |
8460 $temp_dir = ''; | |
8461 } | |
8462 | |
8463 ### OPTIONAL NON-BS MISMATCH OUTPUT AS EXTRA COLUMN IN SAM FILE | |
8464 if ($non_bs_mm){ | |
8465 if ($vanilla){ | |
8466 die "Option '--non_bs_mm' may only be specified for output in SAM format. Please respecify!\n"; | |
8467 } | |
8468 } | |
8469 | |
8470 ### PREFIX FOR OUTPUT FILES | |
8471 if ($prefix){ | |
8472 # removing trailing dots | |
8473 | |
8474 $prefix =~ s/\.+$//; | |
8475 | |
8476 warn "Using the following prefix for output files: $prefix\n\n"; | |
8477 sleep(1); | |
8478 } | |
8479 | |
8480 if (defined $multicore){ | |
8481 unless ($multicore > 0){ | |
8482 die "Core usage needs to be set to 1 or more (currently selected $multicore). Please respecify!\n"; | |
8483 } | |
8484 if ($multicore > 20){ | |
8485 warn "Core usage currently set to more than 20 threads. This might fail horribly but let's see how it goes... (set value: $multicore)\n\n"; | |
8486 } | |
8487 if ($sam){ | |
8488 die "The multicore function currently requires the output to be in BAM format, so please lose either option --sam or --multi\n"; | |
8489 } | |
8490 } | |
8491 else{ | |
8492 $multicore = 1; # default. Single-thread mode | |
8493 warn "Setting parallelization to single-threaded (default)\n\n"; | |
8494 } | |
8495 | |
8496 if ($basename and $multicore > 1){ | |
8497 die "Specifying --basename in conjuction with --multicore is currently not supported (but we are aiming to fix this soon). Please lose either --basename or --multicore to proceed\n\n"; | |
8498 } | |
8499 | |
8500 # Read Group Tags for the @RG header | |
8501 if (defined $rg_sample){ | |
8502 if (defined $rg_id){ | |
8503 warn "--rg_id set to '$rg_id', setting --rg_tag to TRUE\n"; | |
8504 $rg_tag++; # implicitely setting $rg_tag as well | |
8505 } | |
8506 else{ | |
8507 die "--rg_sample cannot be specified without without setting --rg_id. Please set both or none (which would result in the default name 'SAMPLE' for both)\n"; | |
8508 } | |
8509 } | |
8510 | |
8511 if ($rg_tag){ # either true because of --rg_tag, or because --rg_id/--rg_sample were defined as well | |
8512 unless (defined $rg_id){ | |
8513 $rg_id = 'SAMPLE'; | |
8514 } | |
8515 unless (defined $rg_sample){ | |
8516 $rg_sample = 'SAMPLE'; | |
8517 } | |
8518 } | |
8519 | |
8520 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index_present,$multicore,$rg_tag,$rg_id,$rg_sample,$ambig_bam,$cram,$cram_ref,$nucleotide_coverage,$dovetail); | |
8521 } | |
8522 | |
8523 | |
8524 | |
### Writes the SAM header block (@HD, @SQ, optional @RG, @PG) to the OUT filehandle.
### If $ambig_bam is set, the @HD, @SQ and @PG lines are mirrored to the AMBIBAM
### filehandle as well; the @RG line is only ever written to OUT.
###
### Takes no arguments. Reads the file-level variables %SQ_order, %chromosomes,
### $ambig_bam, $rg_tag, $rg_id, $rg_sample, $bismark_version and $command_line.
sub generate_SAM_header{

  # OUT always receives the header; AMBIBAM only on request
  my @header_handles = (\*OUT);
  push @header_handles, \*AMBIBAM if $ambig_bam;

  # @HD = header, VN = version, SO = sort order
  my $hd_line = "\@HD\tVN:1.0\tSO:unsorted\n";
  foreach my $fh (@header_handles){
    print {$fh} $hd_line;
  }

  # @SQ = sequence, SN = seq name, LN = length.
  # The keys of %SQ_order are walked in ascending numerical order so that the @SQ lines
  # come out in a fixed order; the value stored under each key is the sequence name,
  # which is also the key into %chromosomes (holding the sequence itself)
  foreach my $rank (sort {$a<=>$b} keys %SQ_order){
    my $seq_name = $SQ_order{$rank};
    my $length   = length ($chromosomes{$seq_name});
    my $sq_line  = "\@SQ\tSN:$seq_name\tLN:$length\n";
    foreach my $fh (@header_handles){
      print {$fh} $sq_line;
    }
  }

  # 18 11 2015: @RG is added as a header line if --rg_tag or --rg_id/--rg_sample were set
  # @RG = Read Group, PL = Platform, ID: required, SM: sample, can be a description
  print OUT "\@RG\tPL:ILLUMINA\tID:$rg_id\tSM:$rg_sample\n" if ($rg_tag);

  # @PG = program, ID = unique identifier, VN = program version, CL = command line
  my $pg_line = "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n";
  foreach my $fh (@header_handles){
    print {$fh} $pg_line;
  }
}
8558 | |
8559 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format: | |
8560 ### O. Tam (2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011), A. Dei Rossi (2014) | |
8561 | |
### Prints one SAM record for a single-end alignment to the OUT filehandle.
###
### Arguments:
###   $id                      - read identifier (QNAME); also the key into $methylation_call_params
###   $actual_seq              - read sequence as it was sequenced
###   $methylation_call_params - hash reference; the entry stored under $id supplies alignment_strand,
###                              chromosome, position, unmodified_genomic_sequence, methylation_call,
###                              read_conversion, genome_conversion and genomic_seq_for_MD_tag, plus
###                              alignment_score, mapq, CIGAR and indels (Bowtie 2) or
###                              number_of_mismatches (Bowtie 1)
###   $qual                    - quality string of the read
###
### Also reads the file-level settings $bowtie2, $non_bs_mm, $rg_tag, $rg_id and the four
### insertion/deletion open/extend penalties.
###
### Side effect: for reads on the '-' strand whose CIGAR string contains a deletion, the stored
### genomic_seq_for_MD_tag entry is replaced by its reverse complement.
###
### Dies on an unexpected strand, on an unexpected strand/conversion combination and (with
### --non_bs_mm in Bowtie 2 mode) on CIGAR strings that cannot be interpreted.
sub single_end_SAM_output{

  my ($id,$actual_seq,$methylation_call_params,$qual) = @_;

  my $strand = $methylation_call_params->{$id}->{alignment_strand};
  my $aln    = $methylation_call_params->{$id};   # shorthand for all further lookups for this read

  my $chr               = $aln->{chromosome};
  my $start             = $aln->{position};
  my $ref_seq           = $aln->{unmodified_genomic_sequence};
  my $methcall          = $aln->{methylation_call};
  my $read_conversion   = $aln->{read_conversion};
  my $genome_conversion = $aln->{genome_conversion};

  # Bowtie 2 reports an alignment score, Bowtie 1 a mismatch count
  my $number_of_mismatches = $bowtie2 ? $aln->{alignment_score} : $aln->{number_of_mismatches};

  ##### FLAG
  # Of the bitwise SAM FLAG field only 0x10 (SEQ being reverse complemented, value 16) is used for
  # single-end reads. The table is keyed by alignment strand, then by "read conversion:genome conversion":
  #   + CT:CT =>  0  OT
  #   + GA:GA => 16  CTOB, yields information for the original bottom strand
  #   - CT:GA => 16  OB
  #   - GA:CT =>  0  CTOT, yields information for the original top strand
  my %flag_for = (
    '+' => { 'CT:CT' => 0,  'GA:GA' => 16 },
    '-' => { 'CT:GA' => 16, 'GA:CT' => 0  },
  );

  my $flags_of_strand = $flag_for{$strand};
  unless ($flags_of_strand){
    die "Unexpected strand information: $strand\n\n";
  }

  my $flag = $flags_of_strand->{"$read_conversion:$genome_conversion"};
  unless (defined $flag){
    die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
  }

  ##### MAPQ: mapping quality is unavailable for use with Bowtie (1), hence 255
  my $mapq = $bowtie2 ? $aln->{mapq} : 255;

  ##### CIGAR: taken as reported by Bowtie 2; Bowtie 1 output does not contain indels (only matches and mismatches)
  my $cigar = $bowtie2 ? $aln->{CIGAR} : length($actual_seq) . "M";

  ##### The genomic sequence carries 2 additional nucleotides which are removed here: from the 3' end for
  ##### C->T converted reads (original top or bottom strands), from the 5' end otherwise (complementary
  ##### strands in non-directional libraries)
  my $trimmed_length = length($ref_seq) - 2;
  if ($read_conversion eq 'CT'){
    $ref_seq = substr($ref_seq, 0, $trimmed_length);
  }
  else{
    $ref_seq = substr($ref_seq, 2, $trimmed_length);
  }

  ##### Everything is reported relative to the forward genomic strand
  if ($strand eq '-'){
    $actual_seq = revcomp($actual_seq);
    $ref_seq    = revcomp($ref_seq);          # required for comparison with the actual sequence
    $qual       = reverse $qual;              # the quality string has to follow the sequence
    if ($cigar =~ /D/){
      $aln->{genomic_seq_for_MD_tag} = revcomp( $aln->{genomic_seq_for_MD_tag} );
    }
  }

  ##### NM tag: edit distance to the reference. For Bowtie 2 the number of inserted/deleted bases (parsed
  ##### while getting the genomic sequence) is added on top of the value returned by hemming_dist()
  my $edit_distance = hemming_dist($actual_seq,$ref_seq);
  $edit_distance += $aln->{indels} if ($bowtie2);

  ##### MD tag: string providing mismatched reference bases in the alignment (this does include indel information)
  my $MD_tag = make_mismatch_string($actual_seq, $ref_seq,$cigar,$aln->{genomic_seq_for_MD_tag});

  ##### XM tag: methylation call string, reversed whenever the sequence was reverse-complemented
  my $XM_tag = 'XM:Z:' . ($strand eq '+' ? $methcall : scalar reverse $methcall);

  ##### Optionally converting the Bowtie 2 alignment score into a number of non-bisulfite mismatches
  if ($non_bs_mm and $bowtie2) {

    $number_of_mismatches =~ s/-//;   # removing the minus sign

    ### reads with indels: the gap penalties are taken out of the score first
    if ($cigar =~ /(D|I)/) {

      my @len = split (/\D+/,$cigar);   # length per operation
      my @ops = split (/\d+/,$cigar);   # the operations
      shift @ops;                       # remove the empty first element
      die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

      foreach my $i (0..$#len) {
        my $op = $ops[$i];
        next if ($op eq 'M');   # matches/mismatches are irrelevant here

        if ($op eq 'I') {       # insertion in the read sequence
          $number_of_mismatches -= $insertion_open;
          $number_of_mismatches -= $len[$i] * $insertion_extend;
        }
        elsif ($op eq 'D') {    # deletion in the read sequence
          $number_of_mismatches -= $deletion_open;
          $number_of_mismatches -= $len[$i] * $deletion_extend;
        }
        elsif ($cigar =~ tr/[NSHPX=]//) {   # if these (for standard mapping) illegal characters exist we die
          die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else {
          die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
      }
    }

    ### Ns receive a penalty of -1, normal mismatches -6, so the remainder of a division by 6 is taken
    ### as the number of Ns. This might still break if the sequence contained more than 5 Ns
    my $seq_N_count = $number_of_mismatches % 6;
    $number_of_mismatches = int ($number_of_mismatches / 6) + $seq_N_count;
  }

  my $XA_tag = "XA:Z:$number_of_mismatches";

  ##### SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
  ##### RNEXT, PNEXT and TLEN are paired-end fields and always '*', 0 and 0 for single-end reads
  my @sam_fields = (
    $id, $flag, $chr, $start, $mapq, $cigar,
    '*', 0, 0,
    $actual_seq, $qual,
    "NM:i:$edit_distance",        # NM: edit distance based on nucleotide differences
    $MD_tag,
    $XM_tag,
    "XR:Z:$read_conversion",      # XR: read conversion
    "XG:Z:$genome_conversion",    # XG: genome conversion
  );
  push @sam_fields, $XA_tag         if ($non_bs_mm);   # optional: number of non-bisulfite mismatches
  push @sam_fields, "RG:Z:$rg_id"   if ($rg_tag);      # optional: read group

  print OUT join("\t",@sam_fields),"\n";
}
8799 | |
8800 sub paired_end_SAM_output{ | |
8801 | |
8802 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_; | |
8803 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand | |
8804 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2}; | |
8805 my $chr = $methylation_call_params->{$id}->{chromosome}; | |
8806 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1}; | |
8807 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2}; | |
8808 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1}; | |
8809 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2}; | |
8810 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1}; | |
8811 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2}; | |
8812 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion}; | |
8813 | |
8814 my $id_1; | |
8815 my $id_2; | |
8816 | |
8817 if ($old_flag){ | |
8818 $id_1 = $id.'/1'; | |
8819 $id_2 = $id.'/2'; | |
8820 } | |
8821 else{ | |
8822 $id_1 = $id; # appending /1 or /2 confuses some downstream programs such as Picard | |
8823 $id_2 = $id; | |
8824 } | |
8825 | |
8826 # Allows all degenerate nucleotide sequences in reference genome | |
8827 # die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHVX]/i; # X are padded nucleotides in case of insertions in the read | |
8828 # die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHVX]/i; | |
8829 | |
8830 my $index; # used to store the srand origin of the alignment in a less convoluted way | |
8831 | |
8832 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){ | |
8833 $index = 0; ## this is OT (original top strand) | |
8834 } | |
8835 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){ | |
8836 $index = 1; ## this is CTOB (complementary to OB) | |
8837 } | |
8838 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){ | |
8839 $index = 2; ## this is CTOT (complementary to OT) | |
8840 } | |
8841 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){ | |
8842 $index = 3; ## this is OB (original bottom) | |
8843 } | |
8844 else { | |
8845 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n"; | |
8846 } | |
8847 | |
8848 my $number_of_mismatches_1; | |
8849 my $number_of_mismatches_2; | |
8850 | |
8851 if ($bowtie2){ # Bowtie 2 reports always as read 1 then read 2, so this is fine | |
8852 $number_of_mismatches_1 = $methylation_call_params->{$id}->{alignment_score_1}; # only needed for custom allele-specific output, not the default! | |
8853 $number_of_mismatches_2 = $methylation_call_params->{$id}->{alignment_score_2}; | |
8854 } | |
8855 else{ # Bowtie 1 reports always the leftmost read first. That means we have to reverse the strings if the first read aligned in reverse orientation | |
8856 if ($index == 2 or $index == 3){ # CTOT or OB | |
8857 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_2}; # only needed for custom allele-specific output, not the default! | |
8858 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_1}; | |
8859 } | |
8860 else{ # if the first read aligned in forward direction it is like for Bowtie 2 | |
8861 $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default! | |
8862 $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2}; | |
8863 } | |
8864 } | |
8865 | |
8866 | |
8867 | |
8868 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the | |
8869 ### first or last position. | |
8870 | |
8871 if ($index == 0 or $index == 3){ # OT or OB | |
8872 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2); | |
8873 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2); | |
8874 } | |
8875 else{ # CTOT or CTOB | |
8876 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2); | |
8877 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2); | |
8878 } | |
8879 | |
8880 ##### | |
8881 | |
8882 my $start_read_1; | |
8883 my $start_read_2; | |
8884 # adjusting end positions | |
8885 | |
8886 if ($bowtie2){ | |
8887 $start_read_1 = $methylation_call_params->{$id}->{position_1}; | |
8888 $start_read_2 = $methylation_call_params->{$id}->{position_2}; | |
8889 } | |
8890 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 | |
8891 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand | |
8892 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1}; | |
8893 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1; | |
8894 } | |
8895 else{ # read 1 is on the - strand | |
8896 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1; | |
8897 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1}; | |
8898 } | |
8899 } | |
8900 | |
8901 ##### | |
8902 | |
8903 my $end_read_1; | |
8904 my $end_read_2; | |
8905 # adjusting end positions | |
8906 | |
8907 if ($bowtie2){ | |
8908 $end_read_1 = $methylation_call_params->{$id}->{end_position_1}; | |
8909 $end_read_2 = $methylation_call_params->{$id}->{end_position_2}; | |
8910 } | |
8911 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1 | |
8912 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand | |
8913 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1; | |
8914 $end_read_2 = $methylation_call_params->{$id}->{alignment_end}; | |
8915 } | |
8916 else{ | |
8917 $end_read_1 = $methylation_call_params->{$id}->{alignment_end}; | |
8918 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1; | |
8919 } | |
8920 } | |
8921 | |
8922 ##### | |
8923 | |
8924 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011" | |
8925 ## FLAG: bitwise FLAG. Each bit is explained in the following table: | |
8926 ## Bit Description Comment Value | |
8927 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1) | |
8928 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2) | |
8929 ## 0x4 segment unmapped --- --- | |
8930 ## 0x8 next segment in the template unmapped --- --- | |
8931 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16) | |
8932 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32) | |
8933 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64) | |
8934 ## 0x80 the last segment in the template read 2 value: 2^^7 (128) | |
8935 ## 0x100 secondary alignment --- --- | |
8936 ## 0x200 not passing quality controls --- --- | |
8937 ## 0x400 PCR or optical duplicate --- --- | |
8938 | |
8939 ### As the FLAG value do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account | |
8940 | |
8941 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand) | |
8942 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences) | |
8943 | |
8944 my $flag_1; # FLAG variable used for SAM format | |
8945 my $flag_2; | |
8946 | |
8947 ### The new default FLAG values were changed on 21 07 2015, so that reads do not ignored as discordant reads by the new SeqMonk BAM import | |
8948 ### In essence we are going to flip the R1 R2 flags around for CTOT and CTOB reads. We still report the first and second read in the same | |
8949 ### order and only change the actual FLAG value. This should not affect the methylation extraction in any way | |
8950 | |
8951 if ($index == 0){ # OT | |
8952 unless ($old_flag){ | |
8953 $flag_1 = 99; # Read 1 is on the + strand and Read 2 is reversed (1+2+32+64) | |
8954 $flag_2 = 147; # Read 2 is reverse complemented but informative for the OT (1+2+16+128) | |
8955 } | |
8956 else{ | |
8957 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it) | |
8958 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128) | |
8959 } | |
8960 } | |
8961 elsif ($index == 1){ # CTOB | |
8962 unless($old_flag){ | |
8963 $flag_1 = 163; # Read 1 is on the forward strand (CTOB) and Read 2 is reverse complemented but we swap round the FLAG | |
8964 # for R1 and R2 so that we don't end up with discordant pairs | |
8965 # So Read 1 gets Paired read, mapped in proper pair, mate is reversed and second in pair (1+2+32+128) | |
8966 $flag_2 = 83; # Read 2 gets Read paired, mapped in proper pair, first in pair and Read 2 is reversed (1+2+16+64) | |
8967 } | |
8968 else{ | |
8969 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64) | |
8970 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128) | |
8971 } | |
8972 } | |
8973 elsif ($index == 2){ # CTOT | |
8974 unless ($old_flag){ | |
8975 $flag_1 = 147; # Read 1 is reverse complemented (CTOT) and Read 2 is the forward read | |
8976 # but we swap round the FLAG for R1 and R2 so that we do not end up with discordant pairs | |
8977 # So Read 1 gets Read paired, read mapped in proper pair, read reverse complemented and second in pair (1+2+32+128) | |
8978 $flag_2 = 99; # Read 2 gets Read paired, read mapped in proper pair, mate reverse strand and First in Pair (1+2+32+64) | |
8979 } | |
8980 else{ | |
8981 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64) | |
8982 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128) | |
8983 } | |
8984 } | |
8985 elsif ($index == 3){ # OB | |
8986 unless ($old_flag){ | |
8987 $flag_1 = 83; # Read 1 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+16+64) | |
8988 $flag_2 = 163; # Read 2 is on the - strand, mapped in proper pair and Read 1 is reversed (1+2+32+128) | |
8989 } | |
8990 else{ | |
8991 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64) | |
8992 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128) | |
8993 } | |
8994 } | |
8995 | |
8996 ##### | |
8997 | |
8998 my $mapq; | |
8999 | |
9000 if ($bowtie2){ | |
9001 $mapq = $methylation_call_params->{$id}->{mapq}; | |
9002 } | |
9003 else{ | |
9004 $mapq = 255; # Mapping quality is unavailable for use with Bowtie | |
9005 } | |
9006 | |
9007 ##### | |
9008 | |
9009 my $cigar_1; | |
9010 my $cigar_2; | |
9011 | |
9012 if ($bowtie2){ | |
9013 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2 | |
9014 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2}; | |
9015 } | |
9016 else{ | |
9017 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches) | |
9018 $cigar_2 = length($actual_seq_2) . "M"; | |
9019 } | |
9020 | |
9021 ##### | |
9022 | |
9023 my $rnext = '='; # Chromosome of mate; applies to both reads | |
9024 | |
9025 ##### | |
9026 | |
9027 my $pnext_1 = $start_read_2; # Leftmost position of mate | |
9028 my $pnext_2 = $start_read_1; | |
9029 | |
9030 ##### | |
9031 | |
9032 my $tlen_1; # signed observed Template LENgth (or inferred fragment size) | |
9033 my $tlen_2; | |
9034 | |
9035 if ($bowtie2){ | |
9036 | |
9037 if ($start_read_1 <= $start_read_2){ | |
9038 | |
9039 # Read 1 alignment is leftmost | |
9040 | |
9041 if ($end_read_2 >= $end_read_1){ | |
9042 | |
9043 if ($flag_1 == 83 and $dovetail){ # R1 has a reverse orientation | |
9044 # -----------------> read 2 reads are dovetailing, that is one mate alignment extends past the beginning of the other | |
9045 # <------------------- read 1 such that the wrong mate begins upstream | |
9046 # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; | |
9047 # warn "Reads are dovetailing\n"; | |
9048 $tlen_1 = $start_read_1 - $end_read_2 - 1; # Read 1 still receives a - sign even though it is the leftmost one | |
9049 $tlen_2 = $end_read_2 - $start_read_1 + 1; # Read 2 receives a + sign, | |
9050 # warn "TLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; | |
9051 } | |
9052 else{ | |
9053 # -------------> read 1 reads not overlapping | |
9054 # <---------- read 2 | |
9055 # or | |
9056 # -------------------> read 1 reads overlapping | |
9057 # <------------------- read 2 | |
9058 # or | |
9059 # -------------------------> read 1 | |
9060 # <----------------------- read 2 read 2 contained within read 1 | |
9061 # or | |
9062 # -------------------------> read 1 reads 1 and 2 exactly overlapping | |
9063 # <------------------------- read 2 | |
9064 # | |
9065 | |
9066 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, | |
9067 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign | |
9068 # warn "Reads are non/overlapping\nTLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; | |
9069 } | |
9070 } | |
9071 elsif ($end_read_2 < $end_read_1){ | |
9072 | |
9073 # -------------------------> read 1 | |
9074 # <----------- read 2 read 2 contained within read 1 | |
9075 # | |
9076 # or | |
9077 # | |
9078 # -------------------------> read 1 | |
9079 # <------------------------ read 2 read 2 contained within read 1 | |
9080 | |
9081 # start and end of read 2 are fully contained within read 1, using the length of read 1 for the TLEN variable | |
9082 $tlen_1 = $end_read_1 - $start_read_1 + 1; # Set to length of read 1 Leftmost read has a + sign, | |
9083 $tlen_2 = ($end_read_1 - $start_read_1 + 1) * -1; # Set to length of read 1 Rightmost read has a - sign. well this is debatable. Changed this | |
9084 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 | |
9085 } | |
9086 | |
9087 } | |
9088 | |
9089 elsif ($start_read_2 < $start_read_1){ | |
9090 | |
9091 # Read 2 alignment is leftmost | |
9092 | |
9093 if ($end_read_1 >= $end_read_2){ | |
9094 | |
9095 # Read 2 alignment is leftmost | |
9096 if ($flag_1 == 99 and $dovetail){ # R1 has a forward orientation | |
9097 | |
9098 # -----------------> read 1 reads are dovetailing, that is one mate alignment extends past the beginning of the other | |
9099 # <------------------- read 2 such that the wrong mate begins upstream | |
9100 | |
9101 # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; | |
9102 # warn "Reads are dovetailing\n"; | |
9103 $tlen_1 = $end_read_1 - $start_read_2 + 1; # Read 1 still receives a + sign even though it is not leftmost | |
9104 $tlen_2 = $start_read_2 - $end_read_1 - 1; | |
9105 # warn "TLEN 1: $tlen_1\nTLEN 2: $tlen_2\n"; | |
9106 } | |
9107 else{ | |
9108 # -------------> read 2 reads not overlapping | |
9109 # <---------- read 1 | |
9110 # or | |
9111 # -------------------------> read 2 reads overlapping | |
9112 # <------------------------- read 1 | |
9113 # or | |
9114 # -------------------------> read 2 | |
9115 # <----------------------- read 1 read 1 contained within read 2 | |
9116 # or | |
9117 # -------------------------> read 2 | |
9118 # <----------------------- read 1 read 1 contained within read 2 | |
9119 # warn "FLAG 1: $flag_1\nFLAG 2: $flag_2\n"; | |
9120 # warn "Read 2 has a forward orientation\n"; | |
9121 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, | |
9122 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign | |
9123 } | |
9124 } | |
9125 elsif ($end_read_1 < $end_read_2){ | |
9126 | |
9127 # -------------------------> read 2 | |
9128 # <----------- read 1 read 1 contained within read 2 | |
9129 # | |
9130 # or | |
9131 # | |
9132 # -------------------------> read 2 | |
9133 # <------------------------ read 1 read 1 contained within read 2 | |
9134 | |
9135 # start and end of read 1 are fully contained within read 2, using the length of read 2 for the TLEN variable | |
9136 $tlen_1 = ($end_read_2 - $start_read_2 + 1) * -1; # Set to length of read 2 Shorter read receives a - sign, | |
9137 $tlen_2 = $end_read_2 - $start_read_2 + 1; # Set to length of read 2 Longer read receives a +. Well this is debatable. Changed this | |
9138 ### as a request by frozenlyse on SeqAnswers on 24 July 2013 | |
9139 } | |
9140 } | |
9141 } | |
9142 | |
9143 else{ # Bowtie 1 | |
9144 | |
9145 if ($end_read_2 >= $end_read_1){ | |
9146 # Read 1 alignment is leftmost | |
9147 # -------------------------> read 1 | |
9148 # <------------------------- read 2 | |
9149 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing | |
9150 | |
9151 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign, | |
9152 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign | |
9153 } | |
9154 else{ | |
9155 # Read 2 alignment is leftmost | |
9156 # -------------------------> read 2 | |
9157 # <------------------------- read 1 | |
9158 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing | |
9159 | |
9160 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign, | |
9161 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign | |
9162 } | |
9163 } | |
9164 | |
9165 ##### | |
9166 | |
9167 # adjusting the strand of the sequence before we use them to generate mismatch strings | |
9168 if ($strand_1 eq '-'){ | |
9169 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand | |
9170 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence | |
9171 if ($cigar_1 =~ /D/){ | |
9172 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1} ); | |
9173 } | |
9174 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well | |
9175 } | |
9176 if ($strand_2 eq '-'){ | |
9177 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand | |
9178 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence | |
9179 if ($cigar_2 =~ /D/){ | |
9180 $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} = revcomp( $methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2} ); | |
9181 } | |
9182 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well | |
9183 } | |
9184 | |
9185 # print "$actual_seq_1\n$ref_seq_1\n\n"; | |
9186 # print "$actual_seq_2\n$ref_seq_2\n\n"; | |
9187 | |
9188 ##### | |
9189 | |
9190 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence | |
9191 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2); | |
9192 if ($bowtie2){ | |
9193 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence | |
9194 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence | |
9195 } | |
9196 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences | |
9197 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences | |
9198 | |
9199 ##### | |
9200 | |
9201 my $MD_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1,$cigar_1,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_1}); # Optional tag MD: String providing mismatched reference bases in the alignment (including indel information) | |
9202 my $MD_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2,$cigar_2,$methylation_call_params->{$id}->{genomic_seq_for_MD_tag_2}); | |
9203 | |
9204 # my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!) | |
9205 # my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2); | |
9206 | |
9207 ##### | |
9208 | |
9209 my $XM_tag_1; # Optional tag XM: Methylation call string | |
9210 my $XM_tag_2; | |
9211 | |
9212 if ($strand_1 eq '-'){ | |
9213 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented | |
9214 } | |
9215 else{ | |
9216 $XM_tag_1 = "XM:Z:$methcall_1"; | |
9217 } | |
9218 | |
9219 if ($strand_2 eq '-'){ | |
9220 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented | |
9221 } | |
9222 else{ | |
9223 $XM_tag_2 = "XM:Z:$methcall_2"; | |
9224 } | |
9225 | |
9226 ##### | |
9227 | |
9228 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state | |
9229 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state | |
9230 | |
9231 ##### | |
9232 | |
9233 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads | |
9234 | |
9235 ##### | |
9236 | |
9237 # Optionally calculating number of mismatches for Bowtie 2 alignments | |
9238 | |
9239 if ($non_bs_mm) { | |
9240 if ($bowtie2) { | |
9241 | |
9242 $number_of_mismatches_1 =~ s/-//; # removing the minus sign | |
9243 $number_of_mismatches_2 =~ s/-//; | |
9244 | |
9245 ### if Bowtie 2 was used we need to analyse the CIGAR strings whether the reads contained any indels to determine the number of mismatches | |
9246 | |
9247 ### CIGAR 1 | |
9248 if ($cigar_1 =~ /(D|I)/) { | |
9249 # warn "$cigar_1\n"; | |
9250 | |
9251 # parsing CIGAR string | |
9252 my @len = split (/\D+/,$cigar_1); # storing the length per operation | |
9253 my @ops = split (/\d+/,$cigar_1); # storing the operation | |
9254 shift @ops; # remove the empty first element | |
9255 die "CIGAR string '$cigar_1' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
9256 | |
9257 foreach (0..$#len) { | |
9258 if ($ops[$_] eq 'M') { | |
9259 # warn "skipping\n"; | |
9260 next; # irrelevant | |
9261 } | |
9262 elsif ($ops[$_] eq 'I') { # insertion in the read sequence | |
9263 $number_of_mismatches_1 -= $insertion_open; | |
9264 $number_of_mismatches_1 -= $len[$_] * $insertion_extend; | |
9265 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; | |
9266 } | |
9267 elsif ($ops[$_] eq 'D') { # deletion in the read sequence | |
9268 $number_of_mismatches_1 -= $deletion_open; | |
9269 $number_of_mismatches_1 -= $len[$_] * $deletion_extend; | |
9270 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; | |
9271 } | |
9272 elsif ($cigar_1 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die | |
9273 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
9274 } | |
9275 else { | |
9276 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n"; | |
9277 } | |
9278 } | |
9279 | |
9280 # warn "Alignment score $number_of_mismatches_1\n"; | |
9281 # print "Mismatches $number_of_mismatches_1\n\n"; | |
9282 } | |
9283 | |
9284 ### CIGAR 2 | |
9285 if ($cigar_2 =~ /(D|I)/) { | |
9286 # warn "$cigar_2\n"; | |
9287 | |
9288 # parsing CIGAR string | |
9289 my @len = split (/\D+/,$cigar_2); # storing the length per operation | |
9290 my @ops = split (/\d+/,$cigar_2); # storing the operation | |
9291 shift @ops; # remove the empty first element | |
9292 die "CIGAR string '$cigar_2' contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops); | |
9293 | |
9294 foreach (0..$#len) { | |
9295 if ($ops[$_] eq 'M') { | |
9296 # warn "skipping\n"; | |
9297 next; #irrelevant | |
9298 } | |
9299 elsif ($ops[$_] eq 'I') { # insertion in the read sequence | |
9300 $number_of_mismatches_2 -= $insertion_open; | |
9301 $number_of_mismatches_2 -= $len[$_] * $insertion_extend; | |
9302 # warn "Insertion: Subtracting $ops[$_], length $len[$_], open: $insertion_open, extend: $insertion_extend\n"; | |
9303 } | |
9304 elsif ($ops[$_] eq 'D') { # deletion in the read sequence | |
9305 $number_of_mismatches_2 -= $deletion_open; | |
9306 $number_of_mismatches_2 -= $len[$_] * $deletion_extend; | |
9307 # warn "Deletion: Subtracting $ops[$_], length $len[$_], open: $deletion_open, extend: $deletion_extend\n"; | |
9308 } | |
9309 elsif ($cigar_2 =~ tr/[NSHPX=]//) { # if these (for standard mapping) illegal characters exist we die | |
9310 die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
9311 } | |
9312 else { | |
9313 die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n"; | |
9314 } | |
9315 } | |
9316 } | |
9317 | |
9318 ### Now we have InDel corrected Alignment scores | |
9319 | |
9320 ### if the actual sequence contained Ns we need to adjust the number of mismatches. Ns receive a penalty of -1, but normal mismatches receive -6. This might still break if the | |
9321 ### sequence contained more than 5 Ns, but this should occur close to never | |
9322 | |
9323 my $seq_1_N_count = $number_of_mismatches_1 % 6; # modulo 6 will return the integer rest after the division | |
9324 my $seq_2_N_count = $number_of_mismatches_2 % 6; | |
9325 # warn "N count 1: $seq_1_N_count\n"; | |
9326 # warn "N count 2: $seq_2_N_count\n"; | |
9327 | |
9328 $number_of_mismatches_1 = int ($number_of_mismatches_1 / 6) + $seq_1_N_count; | |
9329 $number_of_mismatches_2 = int ($number_of_mismatches_2 / 6) + $seq_2_N_count; | |
9330 | |
9331 # warn "MM1 $number_of_mismatches_1 \n"; | |
9332 # warn "MM2 $number_of_mismatches_2 \n"; | |
9333 } | |
9334 } | |
9335 | |
9336 #### | |
9337 | |
9338 my $XA_tag = "XA:Z:$number_of_mismatches_1"; | |
9339 my $XB_tag = "XB:Z:$number_of_mismatches_2"; | |
9340 | |
9341 #### | |
9342 | |
9343 my $read_group; # optional | |
9344 if ($rg_tag){ | |
9345 $read_group = "RG:Z:$rg_id"; | |
9346 } | |
9347 | |
9348 #### | |
9349 | |
9350 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields | |
9351 ### optionally print number of non-bisulfite mismatches | |
9352 if ($non_bs_mm){ | |
9353 if ($rg_tag){ | |
9354 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag,$read_group)), "\n"; | |
9355 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag,$read_group)), "\n"; | |
9356 } | |
9357 else{ | |
9358 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$XA_tag)), "\n"; | |
9359 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$XB_tag)), "\n"; | |
9360 } | |
9361 } | |
9362 else{ # default | |
9363 if ($rg_tag){ | |
9364 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag,$read_group)), "\n"; | |
9365 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag,$read_group)), "\n"; | |
9366 } | |
9367 else{ | |
9368 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $MD_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n"; | |
9369 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $MD_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n"; | |
9370 } | |
9371 } | |
9372 } | |
9373 | |
9374 | |
### Returns the reverse complement of a nucleotide string.
### The input is reversed and then A<->T and C<->G are exchanged; lower-case a/c/t/g are
### complemented into UPPER-case letters, every other character (e.g. N) is passed through unchanged.
### Dies if no sequence (or a false value such as an empty string) is supplied.
sub revcomp{
    my ($sequence) = @_;
    die "Missing seq to reverse complement\n" unless $sequence;

    my $reverse_complement = scalar reverse $sequence;   # scalar context: reverse the characters of the string
    $reverse_complement =~ tr/ACTGactg/TGACTGAC/;         # complement, folding lower case to upper case
    return $reverse_complement;
}
9381 | |
### Counts the positions at which the read and the reference sequence differ.
### Arguments: (1) the read sequence, (2) the reference sequence.
### Only positions 0 .. length(read)-1 are inspected, so the result is the number of read
### positions whose character is not string-equal to the reference character at that index.
sub hemming_dist{
    my ($read_seq,$genome_seq) = @_;

    my @read_bases   = split //,$read_seq;
    my @genome_bases = split //,$genome_seq;

    # grep in scalar context returns the number of mismatching positions
    my $mismatches = grep { $read_bases[$_] ne $genome_bases[$_] } 0..$#read_bases;
    return $mismatches;
}
9392 | |
9393 | |
9394 ### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might | |
9395 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels | |
9396 ### 05 June 2014 | |
9397 | |
9398 | |
### Builds the optional SAM 'MD:Z:' tag for one alignment.
###
### Arguments:
###   $actual_seq   read sequence
###   $ref_seq      reference sequence compared position by position with $actual_seq; an 'X' in it
###                 marks a base that was padded in because of an insertion in the read
###   $cigar        CIGAR string of the alignment; only the operations M, I and D are accepted once a
###                 deletion is present
###   $md_sequence  sequence the deleted bases are cut out of (only read if $cigar contains a 'D');
###                 it is indexed by the running sum of the M, I and already handled D lengths
###
### Returns the tag as a string, e.g. 'MD:Z:2A1^GG4'.
###
### Step 1 walks over the read and writes '<matches><reference base>' for every mismatch (an 'X'
### position is skipped and does not reset the match counter). Step 2 only runs for reads with
### deletions: the tag from step 1 is split into single characters and, for every 'D' operation in
### the CIGAR string, the run of matches that spans the deletion is split into
### '<before>^<deleted bases><after>'.
###
### Dies on a malformed CIGAR string, on CIGAR operations other than M, I and D, or if a deletion
### cannot be placed within the MD string. Diagnostic output depends on the file-level $verbose.
sub make_mismatch_string{
    my ($actual_seq,$ref_seq,$cigar,$md_sequence) = @_;

    my $MD_tag = "MD:Z:";
    my $prev_matching = 0;

    foreach my $pos ( 0 .. length($actual_seq) - 1 ){

        my $actual_base = substr($actual_seq,$pos,1);
        my $ref_base    = substr($ref_seq,$pos,1);

        if ( $actual_base eq $ref_base ){
            ++$prev_matching;
        }
        else{
            # If the mismatch is due to an insertion (genome base padded as 'X') we simply move on,
            # else we print the previously matching bases as well as the mismatching genomic base
            unless ($ref_base eq 'X'){
                ### There is a mismatch between the sequence and the genome. We write out how many bases matched
                ### until now (this is a padding 0 at the very start or next to another mismatch) followed by
                ### the genomic base
                $MD_tag .= $prev_matching;
                $MD_tag .= $ref_base;

                $prev_matching = 0; # resetting $prev_matching
            }
        }
    }
    ### appending the number of matches one last time
    $MD_tag .= $prev_matching;


    ### If the read contains deletion(s) we need to take care of these in the MD-tag as well
    if ($cigar =~ /D/){
        my $deletions_total = ($cigar =~ tr/D//); # tr with an empty replacement only counts, $cigar is left untouched
        if ($verbose){ warn "Read contains $deletions_total deletions in total\n\n";}

        if ($verbose){ warn "There was a deletion in the read!\n";}
        if ($verbose){ warn "actual:\t$actual_seq\nref:\t$ref_seq\nMD-seq:\t$md_sequence\nMD-tag: $MD_tag\n";}

        # parsing CIGAR string
        my @len = split (/\D+/,$cigar); # storing the length per operation
        my @ops = split (/\d+/,$cigar); # storing the operation
        shift @ops; # remove the empty first element
        die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

        my $MD_pos_so_far = 0;
        my $deletions_processed = 0;
        my $del_pos = 0;
        my $deleted_bases = '';
        # List-context capture instead of 'my $new_MD = $1 if (...)': a 'my' declaration with a
        # statement modifier has undefined behaviour in Perl
        my ($new_MD) = ($MD_tag =~ /MD:Z:(.*)/);
        my $md_index_already_processed;

        my @md = split //,$new_MD;

        if ($verbose){ warn "New MD-tag: $new_MD\n\n";}
        $MD_tag = "MD:Z:"; ### reconstituting a new MD-tag
        $new_MD = '';      # using this to build up a new string that will replace the old \@md

        if ($verbose){ warn "CIGAR string; $cigar\n";}
        ### walking along the CIGAR operations; $del_pos is the offset into $md_sequence
        foreach my $index(0..$#len){

            if ($ops[$index] eq 'M'){ # matching bases
                $del_pos += $len[$index];
                if ($verbose){ warn "Operation is 'M', adding $len[$index] bp\n";}
            }
            elsif($ops[$index] eq 'I'){ # insertion
                $del_pos += $len[$index];
                ### need to add insertions in the read to MD pos so far!
                $MD_pos_so_far += $len[$index];
                if ($verbose){ warn "Operation is 'I', adding $len[$index] bp\n";}
            }
            elsif($ops[$index] eq 'D'){ # deletion
                if ($verbose){ warn "Operation is 'D', extracting $len[$index] bp\n";}
                $deleted_bases = substr($md_sequence,$del_pos,$len[$index]);
                if ($verbose){ warn "Deleted bases: $deleted_bases\n\n";}

                ### Now we need to process the MD-tag so far and write out everything up until this point, including the deletion
                if ($verbose){ warn "Now processing the MD-tag\n";}
                my $op;                      # MD operation currently being assembled: a run of digits or a single mismatch base

                my $this_deletion_processed; # set once the current 'D' operation has been written into the MD-tag
                my $current_md_index;        # index of $el within @md

                foreach my $el (@md){

                    unless (defined $current_md_index){
                        $current_md_index = 0; # first element = index 0
                    }
                    else{
                        ++$current_md_index;
                    }

                    # Characters that were already written to $MD_tag while handling an earlier deletion are only
                    # carried over into the rebuilt MD string
                    if ($md_index_already_processed and ($current_md_index <= $md_index_already_processed)){
                        if ($verbose){ warn "This has to be another deletion within the same read. Currently processing index $current_md_index, but have already processed $md_index_already_processed indexes previously\n";}
                        $new_MD .= $el;
                        next;
                    }

                    if ($verbose){ warn "Current element: $el\n";}
                    unless (defined $op){ # initialize
                        $op = $el;
                        if ($verbose){ warn "Initializing \$op as $op\n";}
                        next;
                    }

                    if ($deletions_processed == $deletions_total){
                        if ($verbose){ warn "Processed $deletions_processed in the read so far, out of $deletions_total total. Just appending elements until the end of the string: here $el\n";}
                        $MD_tag .= $el;
                        $new_MD .= $el;
                        next;
                    }
                    # this only occurs when there are more deletions in the read but we want to regenerate a new MD tag
                    if ($this_deletion_processed){
                        $new_MD .= $el;
                        next;
                    }

                    if ($op =~ /^\d+$/){
                        if ($verbose){ warn "Operation so far was a digit: $op\n";}
                        if ($el =~ /\d/){
                            $op .= $el;
                            if ($verbose){ warn "Appending current operation $el. New operation is: $op\n";}
                            next;
                        }
                        else{
                            if ($verbose){ warn "current element is a word character: $el\n";}

                            ### Need to determine if the matching operation length includes the deletion position
                            if ($verbose){ warn "Processing operation $op and adding it to MD pos which is so far: $MD_pos_so_far; deletion pos is $del_pos.\n";}
                            $MD_pos_so_far += $op;
                            if ($verbose){ warn "MD pos so far: $MD_pos_so_far\n";}
                            if ($MD_pos_so_far < $del_pos){
                                if ($verbose){ warn "Doesn't cover the deletion yet. Writing back out.\n";}
                                $MD_tag .= $op;
                                $new_MD .= $op;
                                if ($verbose){ warn "Setting new operation to: $el\n";}
                                $op = $el; # setting new $op
                            }
                            else{
                                if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
                                ### splitting up the number of matching bases in number before and after the deletion

                                my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
                                my $pos_before_deletion = $op - $pos_after_deletion;
                                if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}
                                $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
                                $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";
                                if ($verbose){ warn "\$newMD after adjusting for the current deletion: $new_MD\n";}

                                #adjusting the MD_position by the number of bases after the deletion
                                $MD_pos_so_far -= $pos_after_deletion;
                                if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }
                                ### also appending the current element because we are writing out the rest of the MD-string unchanged to $new_MD
                                $new_MD .= $el;

                                $deletions_processed += 1;
                                $this_deletion_processed = 1;

                                if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
                                    if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
                                    $MD_tag .= "${pos_after_deletion}";

                                    ### also appending the current element because we are writing out the rest of the MD-string unchanged
                                    if ($verbose){ warn "also appending the current element $el\n";}
                                    $MD_tag .= $el;
                                    ### Finally also adding the length of the deletion to $del_pos
                                    $del_pos += $len[$index];
                                    if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                                }
                                else{
                                    if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
                                    if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length($op),") to current_md_index\n";}


                                    ### This might need looking at!!

                                    $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length($op);
                                    if ($verbose){ warn "Current index = $current_md_index\n";}

                                    if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
                                    $md_index_already_processed = $current_md_index - 1;

                                    if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

                                    ### Finally also adding the length of the deletion to $del_pos
                                    $del_pos += $len[$index];
                                    $MD_pos_so_far += $len[$index];
                                    if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                                    if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                                    #setting $op to an empty string so it is not being processed as the last element
                                    $op = '';
                                    # last; # exiting the loop and processing the CIGAR string further until we hit the next deletion
                                }
                            }
                        }
                        if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                    }
                    else{
                        if ($verbose){ warn "Operation so far was a word character: $op\n";}
                        if ($el =~ /\d+/){
                            # processing the previous mismatch position
                            $MD_tag .= $op;
                            $new_MD .= $op;
                            $MD_pos_so_far += length($op);
                            if ($verbose){ warn "Writing out mismatching base $op and adding length ",length($op),"\n";}
                        }
                        else{
                            # this should never occur since mismatches are followed by a 0 or another digit
                            die "current element is a another word character: $el. This should never happen!\n";
                        }
                        if ($verbose){ warn "Setting new operation to: $el\n";}
                        $op = $el; # setting new $op
                        if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                    }
                }

                ### need to consider last element if it was a digit or number and we are expecting the deletion in the last element of the MD-tag
                if ($op =~ /\d+/ and $deletions_processed < $deletions_total){
                    if ($verbose){ warn "\n\nlast operation was $op\n";}
                    if ($verbose){ warn "Processing operation $op; deletion pos is $del_pos. MD so far was: $MD_pos_so_far\n";}

                    $MD_pos_so_far += $op;
                    if ($verbose){ warn "Adding $op to MD pos so far: $MD_pos_so_far\n";}
                    if ($verbose){ warn "Deletions already processed: $deletions_processed, del total: $deletions_total\n\n";}
                    if ($MD_pos_so_far >= $del_pos){
                        if ($verbose){ warn "Here we go, this operation covers the deletion position!!\n";}
                        ### splitting up the number of matching bases in number before and after the deletion

                        my $pos_after_deletion  = $MD_pos_so_far - $del_pos;
                        my $pos_before_deletion = $op - $pos_after_deletion;
                        if ($verbose){ warn "Splitting up previous operation '$op' into pos before deletion: ${pos_before_deletion} and pos_after_deletion: $pos_after_deletion\n";}

                        $MD_tag .= "${pos_before_deletion}^${deleted_bases}";
                        $new_MD .= "${pos_before_deletion}^${deleted_bases}${pos_after_deletion}";

                        #adjusting the MD_position by the number of bases after the deletion
                        $MD_pos_so_far -= $pos_after_deletion;
                        if ($verbose){ warn "MD after adjusting for deletion: $MD_pos_so_far\n"; }

                        $deletions_processed += 1;
                        $this_deletion_processed = 1;

                        if ($deletions_processed == $deletions_total){ # this was the last deletion of the read
                            if ($verbose){ warn "This was the last deletion in the read ($deletions_processed out of $deletions_total total). Continuing to append \$pos_after_deletion (${pos_after_deletion})..\n";}
                            $MD_tag .= "${pos_after_deletion}";

                        }
                        else{
                            if ($verbose){ warn "This wasn't the last deletion in the read. Substituting the last operation with the current deletion and reconstituting \@md\n";}
                            if ($verbose){ warn "Adding length of deletion string '${pos_before_deletion}^${deleted_bases}' (",length("${pos_before_deletion}^${deleted_bases}")," - length of current operation (",length($op),") to current_md_index\n";}

                            $current_md_index = $current_md_index + length("${pos_before_deletion}^${deleted_bases}") - length($op);
                            if ($verbose){ warn "Current index = $current_md_index\n";}

                            if ($verbose){ warn "Setting \$md_index_already_processed to ",$current_md_index-1,"\n";}
                            # since we are no longer in the loop we don't have to subtract 1 from $current_md_index (it hasn't been incremented in the first place...)
                            $md_index_already_processed = $current_md_index;

                            if ($verbose){ warn "Exiting now and waiting for the next deletion\n";}

                            $MD_pos_so_far += $len[$index];
                            if ($verbose){ warn "MD-tag so far: $MD_tag ~~\n";}
                        }
                        ### Finally also adding the length of the deletion to $del_pos
                        $del_pos += $len[$index];
                        if ($verbose){ warn "Adding length of the deletion itself (",$len[$index],") to \$del_pos: currently at $del_pos\n";}
                    }
                    else{
                        die "Something went wrong, we haven't seen a deletion so far even though we should have...\n\n";
                    }
                }

                # forming a new @md from the MD string rebuilt for this deletion
                @md = split //,$new_MD;
                $new_MD = '';
                if ($verbose){ warn "New \@md array: @md\n\n";}
                if ($verbose){ warn "MD-tag so far: $MD_tag ~~\nnew_MD so far: $new_MD\n\n";}

            }
            else{
                die "Found CIGAR operations other than M, I, D or N: '$ops[$index]'. Not allowed at the moment\n";
            }
        }

    }
    if ($verbose){ warn "Returning MD-tag: $MD_tag\n";}
    return $MD_tag;

}
9712 | |
9713 ### Getting rid of the bitwise comparison because even though the initial comparison is nice and quick, the regex loop looking for non-null bytes characters isn't. We might | |
9714 ### as well do a substring loop to start with, which enables us to generate proper MD:Z: flags that also take proper care of InDels | |
9715 # sub make_mismatch_string{ | |
9716 # my $actual_seq = shift or die "Missing actual sequence\n"; | |
9717 # my $ref_seq = shift or die "Missing reference sequence\n"; | |
9718 # my $XX_tag = "XX:Z:"; | |
9719 | |
9720 # my $tmp = ($actual_seq ^ $ref_seq); # Bitwise comparison | |
9721 | |
9722 # warn "'$tmp'\n"; sleep(1); | |
9723 # my $prev_mm_pos = 0; | |
9724 | |
9725 # while($tmp =~ /[^\0]/g){ # Where bitwise comparison showed a difference | |
9726 # my $nuc_match = pos($tmp) - $prev_mm_pos - 1; # Generate number of nucleotide that matches since last mismatch | |
9727 # my $nuc_mm = substr($ref_seq, pos($tmp) - 1, 1) if pos($tmp) <= length($ref_seq); # Obtain reference nucleotide that was different from the actual read | |
9728 # $XX_tag .= "$nuc_match" if $nuc_match > 0; # Ignore if mismatches are adjacent to each other | |
9729 # $XX_tag .= "$nuc_mm" if defined $nuc_mm; # Ignore if there is no mismatch (prevents uninitialized string concatenation) | |
9730 # $prev_mm_pos = pos($tmp); # Position of last mismatch | |
9731 # } | |
9732 # my $end_matches = length($ref_seq) - $prev_mm_pos; # Provides number of matches from last mismatch till end of sequence | |
9733 # $XX_tag .= "$end_matches" if $end_matches > 0; # Ignore if mismatch is at the end of sequence | |
9734 # return $XX_tag; | |
9735 # } | |
9736 | |
9737 | |
9738 | |
9739 sub print_helpfile{ | |
9740 print << "HOW_TO"; | |
9741 | |
9742 | |
9743 This program is free software: you can redistribute it and/or modify | |
9744 it under the terms of the GNU General Public License as published by | |
9745 the Free Software Foundation, either version 3 of the License, or | |
9746 (at your option) any later version. | |
9747 | |
9748 This program is distributed in the hope that it will be useful, | |
9749 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
9750 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
9751 GNU General Public License for more details. | |
9752 You should have received a copy of the GNU General Public License | |
9753 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
9754 | |
9755 | |
9756 | |
9757 DESCRIPTION | |
9758 | |
9759 | |
9760 The following is a brief description of command line options and arguments to control the Bismark | |
9761 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the | |
9762 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand | |
9763 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand). | |
9764 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome | |
9765 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the | |
9766 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2) | |
9767 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original | |
9768 sequence from the genome and determine if there were any protected C's present or not. | |
9769 | |
9770 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be | |
9771 re-enabled by using --non_directional. | |
9772 | |
9773 The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old | |
9774 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best | |
9775 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below. | |
9776 | |
9777 | |
9778 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} | |
9779 | |
9780 | |
9781 ARGUMENTS: | |
9782 | |
9783 <genome_folder> The path to the folder containing the unmodified reference genome | |
9784 as well as the subfolders created by the Bismark_Genome_Preparation | |
9785 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/). | |
9786 Bismark expects one or more fastA files in this folder (file extension: .fa | |
9787 or .fasta). The path can be relative or absolute. The path may also be set as | |
9788 '--genome_folder /path/to/genome/folder/'. | |
9789 | |
9790 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes | |
9791 "_1", e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must | |
9792 correspond file-for-file and read-for-read with those specified in <mates2>. | |
9793 Reads may be a mix of different lengths. Bismark will produce one mapping result | |
9794 and one report file per paired-end input file pair. | |
9795 | |
9796 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes | |
9797 "_2", e.g. flyA_2.fq,flyB_2.fq). Sequences specified with this option must | |
9798 correspond file-for-file and read-for-read with those specified in <mates1>. | |
9799 Reads may be a mix of different lengths. | |
9800 | |
9801 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g. | |
9802 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will | |
9803 produce one mapping result and one report file per input file. | |
9804 | |
9805 | |
9806 OPTIONS: | |
9807 | |
9808 | |
9809 Input: | |
9810 | |
9811 --se/--single_end <list> Sets single-end mapping mode explicitly giving a list of file names as <list>. | |
9812 The filenames may be provided as a comma [,] or colon [:] separated list. | |
9813 | |
9814 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles>) are FASTQ | |
9815 files (usually having extension .fq or .fastq). This is the default. See also | |
9816 --solexa-quals. | |
9817 | |
9818 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles>) are FASTA | |
9819 files (usually having extensions .fa, .mfa, .fna or similar). All quality values | |
9820 are assumed to be 40 on the Phred scale. FASTA files are expected to contain both | |
9821 the read name and the sequence on a single line (and not spread over several lines). | |
9822 | |
9823 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input. | |
9824 | |
9825 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit. | |
9826 | |
9827 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on. | |
9828 | |
9829 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off. | |
9830 | |
9831 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled | |
9832 (which can't). The formula for conversion is: | |
9833 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This | |
9834 is usually the right option for use with (unconverted) reads emitted by the GA | |
9835 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off. | |
9836 | |
9837 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted) | |
9838 reads emitted by GA Pipeline version 1.3 or later. Default: off. | |
9839 | |
9840 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not | |
9841 specified it is assumed that Bowtie (1 or 2) is in the PATH. | |
9842 | |
9843 | |
9844 Alignment: | |
9845 | |
9846 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs | |
9847 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the | |
9848 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N). | |
9849 | |
9850 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to | |
9851 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for | |
9852 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L). | |
9853 | |
9854 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout | |
9855 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds | |
9856 quality values to the nearest 10 and saturates at 30. This value is not relevant for | |
9857 Bowtie 2. | |
9858 | |
9859 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in | |
9860 --best mode. Best-first search must keep track of many paths at once to ensure it is | |
9861 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the | |
9862 memory impact of the descriptors, but they can still grow very large in some cases. If | |
9863 you receive an error message saying that chunk memory has been exhausted in --best mode, | |
9864 try adjusting this parameter up to dedicate more memory to the descriptors. This value | |
9865 is not relevant for Bowtie 2. Default: 512. | |
9866 | |
9867 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and | |
9868 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation | |
9869 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also | |
9870 satisfied). A 19-bp gap would not be valid in that case. Default: 0. | |
9871 | |
9872 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and | |
9873 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a | |
9874 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied). | |
9875 A 61-bp gap would not be valid in that case. Default: 500. | |
9876 | |
9877 --multicore <int> Sets the number of parallel instances of Bismark to be run concurrently. This forks the | |
9878 Bismark alignment step very early on so that each individual Spawn of Bismark processes | |
9879 only every n-th sequence (n being set by --multicore). Once all processes have completed, | |
9880 the individual BAM files, mapping reports, unmapped or ambiguous FastQ files are merged | |
9881 into single files in very much the same way as they would have been generated running Bismark | |
9882 conventionally with only a single instance. | |
9883 | |
9884 If system resources are plentiful this is a viable option to speed up the alignment process | |
9885 (we observed a near linear speed increase for up to --multicore 8 tested). However, please note | |
9886 that a typical Bismark run will use several cores already (Bismark itself, 2 or 4 threads of | |
9887 Bowtie/Bowtie2, Samtools, gzip etc...) and ~10-16GB of memory depending on the choice of aligner | |
9888 and genome. WARNING: Bismark Parallel (BP?) is resource hungry! Each value of --multicore specified | |
9889 will effectively lead to a linear increase in compute and memory requirements, so --multicore 4 for | |
9890 e.g. the GRCm38 mouse genome will probably use ~20 cores and eat ~40GB of RAM, but at the same time | |
9891 reduce the alignment time to ~25-30%. You have been warned. | |
9892 | |
9893 | |
9894 | |
9895 Bowtie 1 Reporting: | |
9896 | |
9897 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option | |
9898 will be used by default. | |
9899 | |
9900 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum | |
9901 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in | |
9902 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred | |
9903 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both | |
9904 have Phred quality 10. When --best is not specified, Bowtie may report alignments that | |
9905 are sub-optimal in terms of stratum and/or quality (though an effort is made to report | |
9906 the best alignment). --best mode also removes all strand bias. Note that --best does not | |
9907 affect which alignments are considered "valid" by Bowtie, only which valid alignments | |
9908 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified. | |
9909 Default: on. | |
9910 | |
9911 --no_best Disables the --best option which is on by default. This can speed up the alignment process, | |
9912 e.g. for testing purposes, but for credible results it is not recommended to disable --best. | |
9913 | |
9914 | |
9915 Output: | |
9916 | |
9917 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four | |
9918 bisulfite strands will be reported. Default: OFF. | |
9919 | |
9920 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary | |
9921 to the original strands are merely theoretical and should not exist in reality. Specifying directional | |
9922 alignments (which is the default) will only run 2 alignment threads to the original top (OT) | |
9923 or bottom (OB) strands in parallel and report these alignments. This is the recommended option | |
9924 for strand-specific libraries). | |
9925 | |
9926 --pbat This option may be used for PBAT-Seq libraries (Post-Bisulfite Adapter Tagging; Kobayashi et al., | |
9927 PLoS Genetics, 2012). This is essentially the exact opposite of alignments in 'directional' mode, | |
9928 as it will only launch two alignment threads to the CTOT and CTOB strands instead of the normal OT | |
9929 and OB ones. Use this option only if you are certain that your libraries were constructed following | |
9930 a PBAT protocol (if you don't know what PBAT-Seq is you should not specify this option). The option | |
9931 --pbat works only for FastQ files (in both Bowtie and Bowtie 2 mode) and using uncompressed | |
9932 temporary files only. | |
9933 | |
9934 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are | |
9935 split up into several smaller files to run concurrently and the output files are to be merged. | |
9936 | |
9937 --rg_tag Write out a Read Group tag to the resulting SAM/BAM file. This will write the following line to the | |
9938 SAM header: \@RG PL: ILLUMINA ID:SAMPLE SM:SAMPLE ; to set ID and SM see --rg_id and --rg_sample. | |
9939 In addition each read receives an RG:Z:RG-ID tag. Default: OFF. | |
9940 | |
9941 --rg_id <string> Sets the ID field in the \@RG header line. The default is 'SAMPLE'. | |
9942 | |
9943 --rg_sample <string> Sets the SM field in the \@RG header line; can't be set without setting --rg_id as well. The default is | |
9944 'SAMPLE'. | |
9945 | |
9946 --quiet Print nothing besides alignments. | |
9947 | |
9948 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead | |
9949 of SAM format output. | |
9950 | |
9951 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will | |
9952 appear as they did in the input, without any translation of quality values that may have | |
9953 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1 | |
9954 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads | |
9955 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping) | |
9956 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well. | |
9957 | |
9958 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest | |
9959 mismatches or other reads that fail to align uniquely to a file in the output directory. | |
9960 Written reads will appear as they did in the input, without any of the translation of quality | |
9961 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two | |
9962 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and | |
9963 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un. | |
9964 | |
9965 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into | |
9966 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt | |
9967 to create it first. The path to the output folder can be either relative or absolute. | |
9968 | |
9969 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If | |
9970 the specified folder does not exist, Bismark will attempt to create it first. The path to the | |
9971 temporary folder can be either relative or absolute. | |
9972 | |
9973 --non_bs_mm Optionally outputs an extra column specifying the number of non-bisulfite mismatches a read had during the | |
9974 alignment step. This option is only available for SAM format. In Bowtie 2 context, this value is | |
9975 just the number of actual non-bisulfite mismatches and ignores potential insertions or deletions. | |
9976 The format for single-end reads and read 1 of paired-end reads is 'XA:Z:number of mismatches' | |
9977 and 'XB:Z:number of mismatches' for read 2 of paired-end reads. | |
9978 | |
9979 --gzip Temporary bisulfite conversion files will be written out in a GZIP compressed form to save disk | |
9980 space. This option is available for most alignment modes but is not available for paired-end FastA | |
9981 files. This option might be somewhat slower than writing out uncompressed files, but this awaits | |
9982 further testing. | |
9983 | |
9984 --sam The output will be written out in SAM format instead of the default BAM format. Bismark will | |
9985 attempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't | |
9986 been specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found, | |
9987 the SAM output will be compressed with GZIP instead (yielding a .sam.gz output file). | |
9988 | |
9989 --cram Writes the output to a CRAM file instead of BAM. This requires the use of Samtools 1.2 or higher. | |
9990 | |
9991 --cram_ref <ref_file> CRAM output requires you to specify a reference genome as a single FastA file. If this single-FastA | |
9992 reference file is not supplied explicitly it will be regenerated from the genome .fa sequence(s) | |
9993 used for the Bismark run and written to a file called 'Bismark_genome_CRAM_reference.mfa' into the | |
9994 output directory. | |
9995 | |
9996 --samtools_path The path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified | |
9997 explicitly if Samtools is in the PATH already. | |
9998 | |
9999 --prefix <prefix> Prefixes <prefix> to the output filenames. Trailing dots will be replaced by a single one. For | |
10000 example, '--prefix test' with 'file.fq' would result in the output file 'test.file.fq_bismark.sam' etc. | |
10001 | |
10002 -B/--basename <basename> Write all output to files starting with this base file name. For example, '--basename foo' | |
10003 would result in the files 'foo.bam' and 'foo_SE_report.txt' (or its paired-end equivalent). Takes | |
10004 precedence over --prefix. | |
10005 | |
10006 --old_flag Only in paired-end SAM mode, uses the FLAG values used by Bismark v0.8.2 and before. In addition, | |
10007 this option appends /1 and /2 to the read IDs for reads 1 and 2 relative to the input file. Since | |
10008 both the appended read IDs and custom FLAG values may cause problems with some downstream tools | |
10009 such as Picard, new defaults were implemented as of version 0.8.3. | |
10010 | |
10011 | |
10012 default old_flag | |
10013 =================== =================== | |
10014 Read 1 Read 2 Read 1 Read 2 | |
10015 | |
10016 OT: 99 147 67 131 | |
10017 | |
10018 OB: 83 163 115 179 | |
10019 | |
10020 CTOT: 147 99 67 131 | |
10021 | |
10022 CTOB: 163 83 115 179 | |
10023 | |
10024 --ambig_bam For reads that have multiple alignments a random alignment is written out to a special file ending in | |
10025 '.ambiguous.bam'. The alignments are in Bowtie2 format and do not contain any Bismark specific | |
10026 entries such as the methylation call etc. These ambiguous BAM files are intended to be used as | |
10027 coverage estimators for variant callers. | |
10028 | |
10029 --nucleotide_coverage Calculates the mono- and di-nucleotide sequence composition of covered positions in the analysed BAM | |
10030 file and compares it to the genomic average composition once alignments are complete by calling 'bam2nuc'. | |
10031 Since this calculation may take a while, bam2nuc attempts to write the genomic sequence composition | |
10032 into a file called 'genomic_nucleotide_frequencies.txt' inside the reference genome folder so it can | |
10033 be re-used the next time round instead of calculating it once again. If a file 'nucleotide_stats.txt' is | |
10034 found with the Bismark reports it will be automatically detected and used for the Bismark HTML report. | |
10035 This option works only for BAM or CRAM files. | |
10036 | |
10037 | |
10038 Other: | |
10039 | |
10040 -h/--help Displays this help file. | |
10041 | |
10042 -v/--version Displays version information. | |
10043 | |
10044 | |
10045 BOWTIE 2 SPECIFIC OPTIONS | |
10046 | |
10047 --bowtie1 Uses Bowtie 1 instead of Bowtie 2, which might be a good choice for faster and very short | |
10048 alignments. Bismark assumes that raw sequence data is adapter and/or quality trimmed where | |
10049 appropriate. Default: off. | |
10050 | |
10051 --bowtie2 Default: ON. Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end | |
10052 alignments, i.e. searches for alignments involving all read characters (also called | |
10053 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter | |
10054 and/or quality trimmed where appropriate. Both small (.bt2) and large (.bt2l) Bowtie 2 | |
10055 indexes are supported. | |
10056 | |
10057 Bowtie 2 alignment options: | |
10058 | |
10059 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment. | |
10060 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower) | |
10061 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for | |
10062 Bowtie 1 see -n). | |
10063 | |
10064 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values | |
10065 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is | |
10066 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for | |
10067 Bowtie 1 see -l). | |
10068 | |
10069 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched | |
10070 position to be the highest possible, regardless of the actual value. I.e. input is treated | |
10071 as though all quality values are high. This is also the default behavior when the input | |
10072 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default. | |
10073 | |
10074 | |
10075 Bowtie 2 paired-end options: | |
10076 | |
10077 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if | |
10078 it cannot find a concordant or discordant alignment for a pair. This option is invariable and | |
10079 on by default. | |
10080 | |
10081 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments. | |
10082 A discordant alignment is an alignment where both mates align uniquely, but that does not | |
10083 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior | |
10084 and it is on by default. | |
10085 | |
10086 --dovetail It is possible, though unusual, for the mates to "dovetail", with the mates seemingly extending | |
10087 "past" each other as in this example: | |
10088 | |
10089 Mate 1: GTCAGCTACGATATTGTTTGGGGTGACACATTACGC | |
10090 Mate 2: TATGAGTCAGCTACGATATTGTTTGGGGTGACACAT | |
10091 Reference: GCAGATTATATGAGTCAGCTACGATATTGTTTGGGGTGACACATTACGCGTCTTTGAC | |
10092 | |
10093 By default, dovetailing is considered inconsistent with concordant alignment, but setting --dovetail | |
10094 causes Bowtie 2 to consider dovetailing alignments as concordant. This becomes relevant whenever | |
10095 reads are clipped from their 5' end prior to mapping, e.g. because of quality or bias issues. | |
10096 --dovetail is set automatically for PBAT libraries. | |
10097 | |
10098 | |
10099 Bowtie 2 effort options: | |
10100 | |
10101 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using | |
10102 the alignments found so far. A seed extension "fails" if it does not yield a new best or a | |
10103 new second-best alignment. Default: 15. | |
10104 | |
10105 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds. | |
10106 When "re-seeding," Bowtie 2 simply chooses a new set of reads (same length, same number of | |
10107 mismatches allowed) at different offsets and searches for more alignments. A read is considered | |
10108 to have repetitive seeds if the total number of seed hits divided by the number of seeds | |
10109 that aligned at least once is greater than 300. Default: 2. | |
10110 | |
10111 Bowtie 2 parallelization options: | |
10112 | |
10113 | |
10114 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores | |
10115 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly | |
10116 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. | |
10117 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint | |
10118 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads | |
10119 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will | |
10120 automatically use the option '--reorder', which guarantees that output SAM records are printed in | |
10121 an order corresponding to the order of the reads in the original input file, even when -p is set | |
10122 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and | |
10123 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than | |
10124 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally | |
10125 correspond to input order in that case. | |
10126 | |
10127 Bowtie 2 Scoring options: | |
10128 | |
10129 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered | |
10130 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying | |
10131 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length. | |
10132 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is | |
10133 L,0,-0.2. | |
10134 | |
10135 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty | |
10136 of <int1> + N * <int2>. Default: 5, 3. | |
10137 | |
10138 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets | |
10139 a penalty of <int1> + N * <int2>. Default: 5, 3. | |
10140 | |
10141 | |
10142 Bowtie 2 Reporting options: | |
10143 | |
10144 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is | |
10145 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the | |
10146 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the | |
10147 effort expended to find valid alignments. | |
10148 | |
10149 For reference, this used to be the old (now deprecated) description of -M: | |
10150 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it | |
10151 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever | |
10152 happens first. Only the best alignment is reported. Information from the other alignments is used to | |
10153 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes | |
10154 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that | |
10155 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not | |
10156 guarantee that the alignment reported is the best possible in terms of alignment score. -M is | |
10157 always used and its default value is set to 10. | |
10158 | |
10159 | |
10160 'VANILLA' Bismark OUTPUT: | |
10161 | |
10162 Single-end output format (tab-separated): | |
10163 | |
10164 (1) <seq-ID> | |
10165 (2) <read alignment strand> | |
10166 (3) <chromosome> | |
10167 (4) <start position> | |
10168 (5) <end position> | |
10169 (6) <observed bisulfite sequence> | |
10170 (7) <equivalent genomic sequence> | |
10171 (8) <methylation call> | |
10172 (9) <read conversion> | |
10173 (10) <genome conversion> | |
10174 (11) <read quality score (Phred33)> | |
10175 | |
10176 | |
10177 Paired-end output format (tab-separated): | |
10178 (1) <seq-ID> | |
10179 (2) <read 1 alignment strand> | |
10180 (3) <chromosome> | |
10181 (4) <start position> | |
10182 (5) <end position> | |
10183 (6) <observed bisulfite sequence 1> | |
10184 (7) <equivalent genomic sequence 1> | |
10185 (8) <methylation call 1> | |
10186 (9) <observed bisulfite sequence 2> | |
10187 (10) <equivalent genomic sequence 2> | |
10188 (11) <methylation call 2> | |
10189 (12) <read 1 conversion> | |
10190 (13) <genome conversion> | |
10191 (14) <read 1 quality score (Phred33)> | |
10192 (15) <read 2 quality score (Phred33)> | |
10193 | |
10194 | |
10195 Bismark SAM OUTPUT (default): | |
10196 | |
10197 (1) QNAME (seq-ID) | |
10198 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!)) | |
10199 (3) RNAME (chromosome) | |
10200 (4) POS (start position) | |
10201 (5) MAPQ (always 255 for use with Bowtie) | |
10202 (6) CIGAR | |
10203 (7) RNEXT | |
10204 (8) PNEXT | |
10205 (9) TLEN | |
10206 (10) SEQ | |
10207 (11) QUAL (Phred33 scale) | |
10208 (12) NM-tag (edit distance to the reference) | |
10209 (13) MD-tag (base-by-base mismatches to the reference (handles indels)) | |
10210 (14) XM-tag (methylation call string) | |
10211 (15) XR-tag (read conversion state for the alignment) | |
10212 (16) XG-tag (genome conversion state for the alignment) | |
10213 (17) XA/XB-tag (non-bisulfite mismatches) (optional!) | |
10214 | |
10215 Each read of paired-end alignments is written out in a separate line in the above format. | |
10216 | |
10217 | |
10218 Last edited on 25 July 2016 | |
10219 HOW_TO | |
10220 } |