annotate bismark @ 4:427fb56f2e41 draft default tip

- new options - fixes
author bjoern-gruening
date Fri, 01 Mar 2013 13:39:22 -0500
parents 36d124f44c0a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1 #!/usr/bin/perl --
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2 use strict;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3 use warnings;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4 use IO::Handle;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5 use Cwd;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6 $|++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
7 use Getopt::Long;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
8
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
9
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
10 ## This program is Copyright (C) 2010-12, Felix Krueger (felix.krueger@babraham.ac.uk)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
11
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
15 ## (at your option) any later version.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
16
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
20 ## GNU General Public License for more details.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
21
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
24
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
25
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Run-wide state. All of these are file-scoped lexicals that the main loop and the
### subroutines further down read directly.
my $parent_dir      = getcwd;               # working directory at start-up; the main loop returns here for every input file
my $bismark_version = 'v0.7.7';
my $command_line    = join (" ",@ARGV);     # the arguments exactly as given, captured before the rewrite below

### Getopt::Long fails on the '.' in '--solexa1.3-quals', so that option is replaced
### in place with '--phred64-quals' before the command line is processed
for my $option (@ARGV){
  $option = '--phred64-quals' if $option eq '--solexa1.3-quals';
}

my @filenames; # will be populated by processing the command line

my (
    $genome_folder,
    $CT_index_basename,
    $GA_index_basename,
    $path_to_bowtie,
    $sequence_file_format,
    $bowtie_options,
    $directional,
    $unmapped,
    $ambiguous,
    $phred64,
    $solexa,
    $output_dir,
    $bowtie2,
    $vanilla,
    $sam_no_hd,
    $skip,
    $upto,
    $temp_dir,
   ) = process_command_line();

my @fhs;         # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the genome; while it is empty the genome still has to be read in
my %counting;    # counting various events

my $seqID_contains_tabs; # reset to 0 by the main loop for every input file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
45
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Main driver loop. Every entry of @filenames is either one read file (single-end) or
### two comma-separated read files (paired-end). For each entry the reads are bisulfite
### converted, aligned with the Bowtie or Bowtie 2 flavour of the aligner routines, and
### then passed on to the matching start_methylation_call_procedure_* routine.
###
### The FastA and FastQ code paths differ only in which converter and aligner routine is
### called, so both are picked once via code references instead of duplicating the logic.
foreach my $filename (@filenames){

  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  ### resetting the counting hash and fhs
  reset_counters_and_fhs($filename);
  $seqID_contains_tabs = 0;

  ### any format other than 'FASTA' is handled as FastQ
  my $is_fasta    = ($sequence_file_format eq 'FASTA');
  my $format_name = $is_fasta ? 'FastA' : 'FastQ';

  ### PAIRED-END ALIGNMENTS
  if ($filename =~ /,/){
    $fhs[0]->{name} = 'CTread1GAread2CTgenome';
    $fhs[1]->{name} = 'GAread1CTread2GAgenome';
    $fhs[2]->{name} = 'GAread1CTread2CTgenome';
    $fhs[3]->{name} = 'CTread1GAread2GAgenome';

    print "\nPaired-end alignments will be performed\n",'='x39,"\n\n";

    my ($filename_1,$filename_2) = (split (/,/,$filename));
    print "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";

    my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
    my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file

    print "Input files are in $format_name format\n";

    my $transform = $is_fasta ? \&biTransformFastAFiles_paired_end
                              : \&biTransformFastQFiles_paired_end;

    if ($directional){
      ### only the first file name returned by each conversion is kept; $G_to_A_infile_1
      ### and $C_to_T_infile_2 deliberately stay undefined in this mode
      ($C_to_T_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($G_to_A_infile_2) = $transform->($filename_2,2);
    }
    else{
      ($C_to_T_infile_1,$G_to_A_infile_1) = $transform->($filename_1,1); # also passing the read number
      ($C_to_T_infile_2,$G_to_A_infile_2) = $transform->($filename_2,2);
    }

    ### in directional mode instances 1 and 2 receive undef for both input files
    $fhs[0]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[0]->{inputfile_2} = $G_to_A_infile_2;
    $fhs[1]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[1]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[2]->{inputfile_1} = $G_to_A_infile_1;
    $fhs[2]->{inputfile_2} = $C_to_T_infile_2;
    $fhs[3]->{inputfile_1} = $C_to_T_infile_1;
    $fhs[3]->{inputfile_2} = $G_to_A_infile_2;

    my $align = $is_fasta
      ? ($bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                  : \&paired_end_align_fragments_to_bisulfite_genome_fastA)
      : ($bowtie2 ? \&paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                  : \&paired_end_align_fragments_to_bisulfite_genome_fastQ);
    $align->($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

    start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  }

  ### Else we are performing SINGLE-END ALIGNMENTS
  else{
    print "\nSingle-end alignments will be performed\n",'='x39,"\n\n";

    ### Initialising bisulfite conversion filenames
    my ($C_to_T_infile,$G_to_A_infile);

    print "Input file is in $format_name format\n";

    my $transform = $is_fasta ? \&biTransformFastAFiles
                              : \&biTransformFastQFiles;

    if ($directional){
      ### $G_to_A_infile stays undefined and instances 2 and 3 are left untouched
      ($C_to_T_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
    }
    else{
      ($C_to_T_infile,$G_to_A_infile) = $transform->($filename);
      $fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
      $fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
    }

    ### Creating 4 different bowtie filehandles and storing the first entry
    my $align = $is_fasta
      ? ($bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2
                  : \&single_end_align_fragments_to_bisulfite_genome_fastA)
      : ($bowtie2 ? \&single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2
                  : \&single_end_align_fragments_to_bisulfite_genome_fastQ);
    $align->($C_to_T_infile,$G_to_A_infile);

    start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile);
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
204
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Prepares the output side of a single-end run and then starts the methylation call.
### It opens the alignment result file, the mapping report and, if requested, the files
### for unmapped and ambiguously mapping reads; reads the genome into memory if that has
### not happened yet; writes the SAM header unless --vanilla or --sam-no-hd style output
### was requested; and finally calls the FastA or FastQ specific processing routine.
###
###   $sequence_file - read file as passed in by the caller (may contain a leading path)
###   $C_to_T_infile - name of the C->T converted read file, passed through unchanged
###   $G_to_A_infile - name of the G->A converted read file, passed through unchanged
###                    (the main loop leaves this undefined in directional mode)
###
### Dies if any of the output files cannot be opened for writing.
###
### NOTE(review): the bareword handles OUT, REPORT, UNMAPPED and AMBIG are package globals
### and are presumably written to by routines outside this sub - confirm before turning
### them into lexical filehandles.
sub start_methylation_call_procedure_single_ends {
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### all output names are derived from the bare file name; a leading path is dropped
  my $filename = $sequence_file;
  if ($sequence_file =~ /\//){
    ($filename) = $sequence_file =~ m/.*\/(.*)$/;
  }

  ### printing all alignments to a results file
  my $outfile;
  if ($bowtie2){ # SAM format is the default for Bowtie 2
    $outfile = $filename.'_bt2_bismark.sam';
  }
  elsif ($vanilla){ # vanilla custom Bismark output single-end output (like Bismark versions 0.5.X)
    $outfile = $filename.'_bismark.txt';
  }
  else{ # SAM is the default output
    $outfile = $filename.'_bismark.sam';
  }

  ### error messages report the full path that was actually opened, not just the file name
  my $out_path = "$output_dir$outfile";
  print "Writing bisulfite mapping results to $out_path\n\n";
  open (OUT,'>',$out_path) or die "Failed to write to $out_path: $!\n";
  if ($vanilla){
    print OUT "Bismark version: $bismark_version\n";
  }

  ### printing alignment and methylation call summary to a report file
  my $reportfile = $bowtie2 ? $filename.'_bt2_Bismark_mapping_report.txt'
                            : $filename.'_Bismark_mapping_report.txt';
  my $report_path = "$output_dir$reportfile";

  open (REPORT,'>',$report_path) or die "Failed to write to $report_path: $!\n";
  print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";

  if ($unmapped){
    my $unmapped_path = $output_dir.$filename.'_unmapped_reads.txt';
    open (UNMAPPED,'>',$unmapped_path) or die "Failed to write to $unmapped_path: $!\n";
    print "Unmapped sequences will be written to $unmapped_path\n";
  }
  if ($ambiguous){
    my $ambiguous_path = $output_dir.$filename.'_ambiguous_reads.txt';
    open (AMBIG,'>',$ambiguous_path) or die "Failed to write to $ambiguous_path: $!\n";
    print "Ambiguously mapping sequences will be written to $ambiguous_path\n";
  }

  if ($directional){
    print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed!)\n";
  }
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
  unless (%chromosomes){
    my $cwd = getcwd; # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  unless ($vanilla or $sam_no_hd){
    generate_SAM_header();
  }

  ### Input file is in FastA format
  if ($sequence_file_format eq 'FASTA'){
    process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
  ### Input file is in FastQ format
  else{
    process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile);
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
285
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Sets up all output files for one paired-end run and then hands over to the FastA or FastQ
### paired-end methylation caller.
###
### Arguments (in this order):
###   $sequence_file_1, $sequence_file_2   the two original read files
###   $C_to_T_infile_1, $G_to_A_infile_1   converted infiles belonging to read 1
###   $C_to_T_infile_2, $G_to_A_infile_2   converted infiles belonging to read 2
###
### The filehandles OUT and REPORT (and, if requested, UNMAPPED_1/2 and AMBIG_1/2) are opened
### here and left open when the sub returns. The sub relies on file-level settings such as
### $bowtie2, $vanilla, $output_dir, $unmapped, $ambiguous, $directional and $sam_no_hd.
sub start_methylation_call_procedure_paired_ends {
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ### output file names are derived from the input file names with any leading path removed
  my $strip_path = sub {
    my ($path) = @_;
    return $path unless $path =~ /\//;
    my (undef,$name) = $path =~ m/(.*\/)(.*)$/;
    return $name;
  };
  my $filename_1 = $strip_path->($sequence_file_1);
  my $filename_2 = $strip_path->($sequence_file_2);

  ### printing all alignments to a results file
  ### (SAM is written for Bowtie 2 and by default for Bowtie 1; --vanilla selects the custom Bismark format)
  my $alignment_suffix = $bowtie2 ? '_bismark_bt2_pe.sam'
                       : $vanilla ? '_bismark_pe.txt'
                       :            '_bismark_pe.sam';
  (my $outfile = $filename_1) =~ s/$/$alignment_suffix/;

  print "Writing bisulfite mapping results to $outfile\n\n";
  open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!";
  print OUT "Bismark version: $bismark_version\n" if $vanilla;

  ### printing alignment and methylation call summary to a report file
  my $report_suffix = $bowtie2 ? '_Bismark_bt2_paired-end_mapping_report.txt'
                    :            '_Bismark_paired-end_mapping_report.txt';
  (my $reportfile = $filename_1) =~ s/$/$report_suffix/;

  open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
  print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
  print REPORT "Bowtie was run against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";

  ### Unmapped read output (one file per read of the pair)
  if ($unmapped){
    (my $unmapped_1 = $filename_1) =~ s/$/_unmapped_reads_1.txt/;
    (my $unmapped_2 = $filename_2) =~ s/$/_unmapped_reads_2.txt/;
    open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
    open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
    print "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
  }

  ### Ambiguously mapping read output (one file per read of the pair)
  if ($ambiguous){
    (my $amb_1 = $filename_1) =~ s/$/_ambiguous_reads_1.txt/;
    (my $amb_2 = $filename_2) =~ s/$/_ambiguous_reads_2.txt/;
    open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
    open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
    print "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
  }

  print REPORT "Option '--directional' specified: alignments to complementary strands will be ignored (i.e. not performed)\n" if $directional;

  ### the genome is only read in if %chromosomes is still empty; for a second or later input file
  ### it may already be held in memory
  if (!%chromosomes){
    my $cwd = getcwd(); # storing the path of the current working directory
    print "Current working directory is: $cwd\n\n";
    read_genome_into_memory($cwd);
  }

  ### a SAM header is only generated for SAM output, and only if it was not switched off
  generate_SAM_header() unless ($vanilla or $sam_no_hd);

  ### handing over to the format-specific methylation caller
  my @files = ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
  if ($sequence_file_format eq 'FASTA'){
    ### Input files are in FastA format
    process_fastA_files_for_paired_end_methylation_calls(@files);
  }
  else{
    ### Input files are in FastQ format
    process_fastQ_files_for_paired_end_methylation_calls(@files);
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
384
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Finishes a single-end run: deletes the temporary converted infiles and prints the final
### alignment report and the final cytosine methylation report, both to the screen
### (STDOUT or STDERR, depending on the line) and to the REPORT filehandle.
###
### Arguments:
###   $C_to_T_infile   name of the temporary C->T converted infile (located in $temp_dir)
###   $G_to_A_infile   name of the temporary G->A converted infile (located in $temp_dir);
###                    only deleted when $directional is not set
###
### All figures are taken from the file-level %counting hash. REPORT has to be open for
### writing when this sub is called. Nothing meaningful is returned.
sub print_final_analysis_report_single_end{
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  ### All sequences from the original sequence file have been analysed now
  ### deleting temporary C->T or G->A infiles (unlink returns the number of files it removed)
  if ($directional){
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
    if ($deletion_successful == 1){
      warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
    }
    else{
      warn "Could not delete temporary file $C_to_T_infile properly $!\n";
    }
  }
  else{
    my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
    if ($deletion_successful == 2){
      warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
    }
    else{
      warn "Could not delete temporary files properly $!\n";
    }
  }

  ### printing a final report for the alignment procedure
  print REPORT "Final Alignment report\n",'='x22,"\n";
  print "Final Alignment report\n",'='x22,"\n";

  ### printing a final report for the methylation call procedure
  warn "Sequences analysed in total:\t$counting{sequences_count}\n";
  print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";

  ### the mapping efficiency is reported as 0 if no sequences were analysed (avoids a division by zero)
  my $percent_alignable_sequences = 0;
  unless ($counting{sequences_count} == 0){
    $percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
  }

  warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
  print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";

  ### low complexity alignments that were overruled are not part of the report, so no percentage
  ### is calculated for $counting{low_complexity_alignments_overruled_count} here

  ### the alignment breakdown is identical on screen (STDOUT) and in the report file, so it is assembled once
  my $alignment_breakdown = join ('',
    "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n",
    "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n",
    "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n",
    "Number of sequences with unique best (first) alignment came from the bowtie output:\n",
    join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),
    "\n\n",
  );
  if ($directional){
    $alignment_breakdown .= "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
  }
  print $alignment_breakdown;
  print REPORT $alignment_breakdown;

  ### detailed information about Cs analysed
  my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};

  ### every line goes to STDERR and, unchanged, to the report file; using a single list keeps
  ### the two outputs from drifting apart (all values are separated from their label by one tab)
  my @cytosine_report = (
    "Final Cytosine Methylation Report\n".('='x33)."\n",
    "Total number of C's analysed:\t$total_number_of_C\n\n",
    "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n",
    "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n",
    "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n",
    "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n",
    "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n",
    "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n",
  );
  warn $_ foreach @cytosine_report;
  print REPORT $_ foreach @cytosine_report;

  ### percentage of methylated Cs in one context; undef if no C was seen in that context at all
  my $percent_methylated = sub {
    my ($methylated,$unmethylated) = @_;
    return unless (($methylated+$unmethylated) > 0);
    return sprintf("%.1f",100*$methylated/($methylated+$unmethylated));
  };

  my $percent_meCpG = $percent_methylated->($counting{total_meCpG_count},$counting{total_unmethylated_CpG_count});
  my $percent_meCHG = $percent_methylated->($counting{total_meCHG_count},$counting{total_unmethylated_CHG_count});
  my $percent_meCHH = $percent_methylated->($counting{total_meCHH_count},$counting{total_unmethylated_CHH_count});

  ### printing the methylated C percentage per context if applicable; the CHH line closes the section
  ### with two additional blank lines
  foreach my $context (['CpG',$percent_meCpG,"\n"],['CHG',$percent_meCHG,"\n"],['CHH',$percent_meCHH,"\n\n\n"]){
    my ($name,$percent,$line_end) = @$context;

    ### testing for definedness: a percentage of 0.0 is a valid result and still gets printed
    my $line = defined $percent
      ? "C methylated in $name context:\t${percent}%".$line_end
      : "Can't determine percentage of methylated Cs in $name context if value was 0".$line_end;

    warn $line;
    print REPORT $line;
  }

  if ($seqID_contains_tabs){
    my $tab_warning = "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
    warn $tab_warning;
    print REPORT $tab_warning;
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
528
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
529 sub print_final_analysis_report_paired_ends{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
530 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
531 ### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
532 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
533 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
534 if ($deletion_successful == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
535 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
536 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
537 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
538 warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
539 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
540 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
541 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
542 my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
543 if ($deletion_successful == 4){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
544 warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
545 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
546 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
547 warn "Could not delete temporary files properly: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
548 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
549 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
550
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
551 ### printing a final report for the alignment procedure
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
552 warn "Final Alignment report\n",'='x22,"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
553 print REPORT "Final Alignment report\n",'='x22,"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
554 # foreach my $index (0..$#fhs){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
555 # print "$fhs[$index]->{name}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
556 # print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
557 # print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
558 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
559
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
560 ### printing a final report for the methylation call procedure
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
561 warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
562 print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
563
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
564 my $percent_alignable_sequence_pairs;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
565 if ($counting{sequences_count} == 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
566 $percent_alignable_sequence_pairs = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
567 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
568 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
569 $percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
570 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
571 print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
572 print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
573
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
574 print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
575 print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
576 print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
577 print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
578 print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
579
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
580
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
581 print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
582 print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
583 print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
584 print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
585 print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
586 ### detailed information about Cs analysed
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
587
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
588 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
589 print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
590 print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
591 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
592
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
593 warn "Final Cytosine Methylation Report\n",'='x33,"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
594 print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
595
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
596 my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
597 warn "Total number of C's analysed:\t$total_number_of_C\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
598 warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
599 warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
600 warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
601 warn "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
602 warn "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
603 warn "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
604
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
605 print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
606 print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
607 print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
608 print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
609 print REPORT "Total C to T conversions in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
610 print REPORT "Total C to T conversions in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
611 print REPORT "Total C to T conversions in CHH context:\t$counting{total_unmethylated_CHH_count}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
612
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
613 my $percent_meCHG;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
614 if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
615 $percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
616 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
617
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
618 my $percent_meCHH;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
619 if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
620 $percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
621 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
622
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
623 my $percent_meCpG;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
624 if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
625 $percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
626 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
627
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
628 ### printing methylated CpG percentage if applicable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
629 if ($percent_meCpG){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
630 warn "C methylated in CpG context:\t${percent_meCpG}%\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
631 print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
632 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
633 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
634 warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
635 print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
636 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
637
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
638 ### printing methylated C percentage in CHG context if applicable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
639 if ($percent_meCHG){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
640 warn "C methylated in CHG context:\t${percent_meCHG}%\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
641 print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
642 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
643 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
644 warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
645 print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
646 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
647
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
648 ### printing methylated C percentage in CHH context if applicable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
649 if ($percent_meCHH){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
650 warn "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
651 print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
652 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
653 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
654 warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
655 print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
656 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
657
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
658 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
659
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub process_single_end_fastA_file_for_methylation_call{
  ### Walks through a single-end FastA read file record by record (one header line followed by
  ### one sequence line) and hands every read to the Bowtie 1 or Bowtie 2 result checker.
  ###
  ### Arguments:
  ###   $sequence_file  - FastA file holding the reads; a name ending in .gz is read through zcat
  ###   $C_to_T_infile  - passed on unchanged to print_final_analysis_report_single_end
  ###   $G_to_A_infile  - passed on unchanged to print_final_analysis_report_single_end
  ###
  ### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
  ### $counting{sequences_count} for every read that is handed to the checker, and writes to the
  ### AMBIG and UNMAPPED filehandles, which must already be open when the corresponding option is set.
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  ### The list form of the pipe open starts zcat directly without a shell, so file names containing
  ### spaces or shell metacharacters are passed through safely.
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### explicit read mode: characters such as '>' or '|' in the file name are never taken as an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;
  my $stopped_early = 0; # set once --upto ended the loop before the end of the input was reached

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    # last if ($counting{sequences_count} > 100);
    my $identifier = <IN>;
    my $sequence = <IN>;
    ### testing for definedness (not truth) so that a final unterminated line consisting of "0" is not mistaken for the end of the file
    last unless (defined $identifier and defined $sequence);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto and $count > $upto){
      $stopped_early = 1;
      last;
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;

    $identifier =~ s/^>//; # deletes the > at the beginning of FastA headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier); # default Bowtie 1
    }

    ### an undefined or empty return value is treated as 0 so that the numeric comparisons below never see undef
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    if ($ambiguous and $return == 2){
      print AMBIG ">$identifier\n";
      print AMBIG "$sequence\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED ">$identifier\n";
      print UNMAPPED "$sequence\n";
    }
  }

  ### Releasing the input handle. A pipe that was abandoned early because of --upto is expected to
  ### report a non-zero status (zcat gets interrupted), so this only warns if the whole input was consumed.
  unless (close IN){
    warn "Closing the input stream of $sequence_file reported a problem (status $?) $!\n" unless ($stopped_early);
  }

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
732
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub process_single_end_fastQ_file_for_methylation_call{
  ### Walks through a single-end FastQ read file record by record (four lines per read: header,
  ### sequence, second header, quality string) and hands every read to the Bowtie 1 or Bowtie 2
  ### result checker.
  ###
  ### Arguments:
  ###   $sequence_file  - FastQ file holding the reads; a name ending in .gz is read through zcat
  ###   $C_to_T_infile  - passed on unchanged to print_final_analysis_report_single_end
  ###   $G_to_A_infile  - passed on unchanged to print_final_analysis_report_single_end
  ###
  ### Uses the file-level settings $skip, $upto, $bowtie2, $ambiguous and $unmapped, increments
  ### $counting{sequences_count} for every read that is handed to the checker, and writes to the
  ### AMBIG and UNMAPPED filehandles, which must already be open when the corresponding option is set.
  my ($sequence_file,$C_to_T_infile,$G_to_A_infile) = @_;

  ### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
  ### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
  ### the C->T or G->A version

  ### gzipped version of the infile
  ### The list form of the pipe open starts zcat directly without a shell, so file names containing
  ### spaces or shell metacharacters are passed through safely.
  if ($sequence_file =~ /\.gz$/){
    open (IN,'-|','zcat',$sequence_file) or die "Failed to open zcat pipe to $sequence_file: $!\n";
  }
  else{
    ### explicit read mode: characters such as '>' or '|' in the file name are never taken as an open mode
    open (IN,'<',$sequence_file) or die "Failed to read from $sequence_file: $!\n";
  }

  my $count = 0;
  my $stopped_early = 0; # set once --upto ended the loop before the end of the input was reached

  warn "\nReading in the sequence file $sequence_file\n";
  while (1) {
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier_2 = <IN>;
    my $quality_value = <IN>;
    ### testing for definedness (not truth): a final quality line consisting of just "0" without a
    ### trailing newline is a legitimate quality string and must not be mistaken for the end of the file
    last unless (defined $identifier and defined $sequence and defined $identifier_2 and defined $quality_value);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto and $count > $upto){
      $stopped_early = 1;
      last;
    }

    $counting{sequences_count}++;

    if ($counting{sequences_count}%1000000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }
    chomp $sequence;
    chomp $identifier;
    chomp $quality_value;

    $identifier =~ s/^\@//; # deletes the @ at the beginning of Illumina FastQ headers

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_single_end_bowtie2 (uc($sequence),$identifier,$quality_value);
    }
    else{
      $return = check_bowtie_results_single_end(uc($sequence),$identifier,$quality_value); # default Bowtie 1
    }

    ### an undefined or empty return value is treated as 0 so that the numeric comparisons below never see undef
    unless ($return){
      $return = 0;
    }

    # print the sequence to ambiguous.out if --ambiguous was specified
    # ($identifier_2 was not chomped and therefore still carries its own line ending)
    if ($ambiguous and $return == 2){
      print AMBIG "\@$identifier\n";
      print AMBIG "$sequence\n";
      print AMBIG $identifier_2;
      print AMBIG "$quality_value\n";
    }

    # print the sequence to <unmapped.out> file if --un was specified
    elsif ($unmapped and $return == 1){
      print UNMAPPED "\@$identifier\n";
      print UNMAPPED "$sequence\n";
      print UNMAPPED $identifier_2;
      print UNMAPPED "$quality_value\n";
    }
  }

  ### Releasing the input handle. A pipe that was abandoned early because of --upto is expected to
  ### report a non-zero status (zcat gets interrupted), so this only warns if the whole input was consumed.
  unless (close IN){
    warn "Closing the input stream of $sequence_file reported a problem (status $?) $!\n" unless ($stopped_early);
  }

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile);

}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
812
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
813 sub process_fastA_files_for_paired_end_methylation_calls{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
814 my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
815 ### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
816 ### make a methylation call. The sequence identifier by definition needs to be the same for a sequence pair used for paired-end mapping.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
817 ### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
818 ### converted genomes (either the C->T or G->A version)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
819
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
820 ### gzipped version of the infiles
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
821 if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
822 open (IN1,"zcat $sequence_file_1 |") or die "Failed to open zcat pipe to $sequence_file_1 $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
823 open (IN2,"zcat $sequence_file_2 |") or die "Failed to open zcat pipe to $sequence_file_2 $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
824 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
825 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
826 open (IN1,$sequence_file_1) or die $!;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
827 open (IN2,$sequence_file_2) or die $!;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
828 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
829
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
830 warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
831 ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
832
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
833 my $count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
834
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
835 while (1) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
836 # reading from the first input file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
837 my $identifier_1 = <IN1>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
838 my $sequence_1 = <IN1>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
839 # reading from the second input file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
840 my $identifier_2 = <IN2>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
841 my $sequence_2 = <IN2>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
842 last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
843
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
844 $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
845 $identifier_2 = fix_IDs($identifier_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
846
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
847 ++$count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
848
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
849 if ($skip){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
850 next unless ($count > $skip);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
851 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
852 if ($upto){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
853 last if ($count > $upto);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
854 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
855
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
856 $counting{sequences_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
857 if ($counting{sequences_count}%100000==0) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
858 warn "Processed $counting{sequences_count} sequences so far\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
859 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
860 my $orig_identifier_1 = $identifier_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
861 my $orig_identifier_2 = $identifier_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
862
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
863 chomp $sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
864 chomp $identifier_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
865 chomp $sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
866 chomp $identifier_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
867
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
868 $identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
869
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
870 my $return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
871 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
872 $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
873 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
874 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
875 $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
876 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
877
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
878 unless ($return){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
879 $return = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
880 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
881
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
882 # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
883 if ($ambiguous and $return == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
884 print AMBIG_1 $orig_identifier_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
885 print AMBIG_1 "$sequence_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
886 print AMBIG_2 $orig_identifier_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
887 print AMBIG_2 "$sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
888 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
889
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
890 # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
891 elsif ($unmapped and $return == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
892 print UNMAPPED_1 $orig_identifier_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
893 print UNMAPPED_1 "$sequence_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
894 print UNMAPPED_2 $orig_identifier_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
895 print UNMAPPED_2 "$sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
896 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
897 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
898
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
899 print "Processed $counting{sequences_count} sequences in total\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
900
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
901 print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
902
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
903 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
904
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Reads two paired-end FastQ files in lockstep (four lines per record and per file) and hands every read pair to
### check_bowtie_results_paired_ends() or, if $bowtie2 is set, check_bowtie_results_paired_ends_bowtie2().
###
### Arguments: the two FastQ sequence files, followed by the C->T and G->A converted infiles for read 1 and read 2.
###            The four converted infile names are only passed on to print_final_analysis_report_paired_ends().
### Globals:   reads $skip, $upto, $bowtie2, $ambiguous and $unmapped; increments $counting{sequences_count}.
### Output:    read pairs for which the checker returned 2 are written to AMBIG_1/AMBIG_2 (if $ambiguous is set),
###            pairs for which it returned 1 are written to UNMAPPED_1/UNMAPPED_2 (if $unmapped is set).
### The input is read through the bareword filehandles IN1 and IN2, which are left open when this sub returns.
### Dies if either of the input files cannot be opened.
sub process_fastQ_files_for_paired_end_methylation_calls{
  my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
  ### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
  ### make a methylation call. The sequence identifier per definition needs to be the same for a sequence pair used for paired-end alignments.
  ### Now reading in the sequence files sequence by sequence to see if the current sequences produced a paired-end alignment to one (or both)
  ### of the converted genomes (either C->T or G->A version)

  ### Each input file is opened on its own, so a pair consisting of one gzipped and one plain file is handled correctly.
  ### The list form of the pipe open and the 3-argument open both bypass the shell, so file names containing spaces or
  ### shell metacharacters are treated as plain file names.
  foreach my $input ([\*IN1,$sequence_file_1],[\*IN2,$sequence_file_2]){
    my ($handle,$file) = @{$input};
    if ($file =~ /\.gz$/){
      ### gzipped version of the infile
      open ($handle,'-|','zcat',$file) or die "Failed to open zcat pipe to $file $!\n";
    }
    else{
      open ($handle,'<',$file) or die "Failed to read from sequence file $file: $!\n";
    }
  }

  my $count = 0;

  warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
  ### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
  while (1) {
    # reading from the first input file
    my $identifier_1 = <IN1>;
    my $sequence_1 = <IN1>;
    my $ident_1 = <IN1>;         # only needed for writing the record back out
    my $quality_value_1 = <IN1>;
    # reading from the second input file
    my $identifier_2 = <IN2>;
    my $sequence_2 = <IN2>;
    my $ident_2 = <IN2>;         # only needed for writing the record back out
    my $quality_value_2 = <IN2>;

    ### readline returns undef at the end of the input. Testing for definedness (and not for truth) makes sure that a
    ### final line which consists of a single '0' and lacks a trailing newline is not mistaken for the end of the file
    last unless (defined $identifier_1 and defined $sequence_1 and defined $quality_value_1
                 and defined $identifier_2 and defined $sequence_2 and defined $quality_value_2);

    $identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
    $identifier_2 = fix_IDs($identifier_2);

    ++$count;

    ### records up to and including $skip are read but not processed; reading stops once $upto records were seen
    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $counting{sequences_count}++;
    if ($counting{sequences_count}%100000==0) {
      warn "Processed $counting{sequences_count} sequences so far\n";
    }

    ### the original identifiers are copied before chomping, i.e. they keep their line ending for printing them out again
    my $orig_identifier_1 = $identifier_1;
    my $orig_identifier_2 = $identifier_2;

    chomp $sequence_1;
    chomp $identifier_1;
    chomp $sequence_2;
    chomp $identifier_2;
    chomp $quality_value_1;
    chomp $quality_value_2;

    $identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID

    my $return;
    if ($bowtie2){
      $return = check_bowtie_results_paired_ends_bowtie2 (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }
    else{
      $return = check_bowtie_results_paired_ends (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
    }

    ### an undefined or empty return value is normalised to 0 so that the numerical comparisons below do not warn
    $return = 0 unless ($return);

    # print the sequences to ambiguous_1 and _2 if --ambiguous was specified
    if ($ambiguous and $return == 2){
      # seq_1
      print AMBIG_1 $orig_identifier_1;
      print AMBIG_1 "$sequence_1\n";
      print AMBIG_1 $ident_1;
      print AMBIG_1 "$quality_value_1\n";
      # seq_2
      print AMBIG_2 $orig_identifier_2;
      print AMBIG_2 "$sequence_2\n";
      print AMBIG_2 $ident_2;
      print AMBIG_2 "$quality_value_2\n";
    }

    # print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
    elsif ($unmapped and $return == 1){
      # seq_1
      print UNMAPPED_1 $orig_identifier_1;
      print UNMAPPED_1 "$sequence_1\n";
      print UNMAPPED_1 $ident_1;
      print UNMAPPED_1 "$quality_value_1\n";
      # seq_2
      print UNMAPPED_2 $orig_identifier_2;
      print UNMAPPED_2 "$sequence_2\n";
      print UNMAPPED_2 $ident_2;
      print UNMAPPED_2 "$quality_value_2\n";
    }
  }

  print "Processed $counting{sequences_count} sequences in total\n\n";

  print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);

}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1014
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1015 sub check_bowtie_results_single_end{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1016 my ($sequence,$identifier,$quality_value) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1017
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1018 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1019 $quality_value = 'I'x(length$sequence);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1020 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1021
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1022 my %mismatches = ();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1023 ### reading from the bowtie output files to see if this sequence aligned to a bisulfite converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1024 foreach my $index (0..$#fhs){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1025
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1026 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1027 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1028 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1029 if ($fhs[$index]->{last_seq_id} eq $identifier) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1030 ###############################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1031 ### STEP I Now processing the alignment stored in last_line ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1032 ###############################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1033 my $valid_alignment_found_1 = decide_whether_single_end_alignment_is_valid($index,$identifier);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1034 ### sequences can fail at this point if there was only 1 seq in the wrong orientation, or if there were 2 seqs, both in the wrong orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1035 ### we only continue to extract useful information about this alignment if 1 was returned
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1036 if ($valid_alignment_found_1 == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1037 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1038 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1039 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1040
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1041 unless($mismatch_info){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1042 $mismatch_info = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1043 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1044
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1045 chomp $mismatch_info;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1046 my $chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1047 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1048 $chromosome = $mapped_chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1049 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1050 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1051 die "Chromosome number extraction failed for $mapped_chromosome\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1052 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1053 ### Now extracting the number of mismatches to the converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1054 my $number_of_mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1055 if ($mismatch_info eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1056 $number_of_mismatches = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1057 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1058 elsif ($mismatch_info =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1059 my @mismatches = split (/,/,$mismatch_info);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1060 $number_of_mismatches = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1061 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1062 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1063 die "Something weird is going on with the mismatch field:\t>>> $mismatch_info <<<\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1064 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1065 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1066 my $alignment_location = join (":",$chromosome,$position);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1067 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1068 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1069 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1070 ### number for the found alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1071 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1072 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1073 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1074 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1075 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1076 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1077 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1078 $number_of_mismatches = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1079 ##################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1080 ### STEP II Now reading in the next line from the bowtie filehandle. The next alignment can either be a second alignment of the same sequence or a
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1081 ### a new sequence. In either case we will store the next line in @fhs ->{last_line}. In case the alignment is already the next entry, a 0 will
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1082 ### be returned as $valid_alignment_found and it will then be processed in the next round only.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1083 ##################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1084 my $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1085 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1086 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1087 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1088 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1089 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1090 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1091 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1092 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1093 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1094 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1095 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1096 my $valid_alignment_found_2 = decide_whether_single_end_alignment_is_valid($index,$identifier);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1097 ### we only continue to extract useful information about this second alignment if 1 was returned
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1098 if ($valid_alignment_found_2 == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1099 ### If the second Bowtie output made it this far it is in the correct orientation, so we can continue to analyse the alignment itself
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1100 ### need to extract the chromosome number from the bowtie output (which is either XY_cf (complete forward) or XY_cr (complete reverse)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1101 my ($id,$strand,$mapped_chromosome,$position,$bowtie_sequence,$mismatch_info) = (split (/\t/,$fhs[$index]->{last_line},-1))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1102 unless($mismatch_info){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1103 $mismatch_info = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1104 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1105 chomp $mismatch_info;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1106
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1107 my $chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1108 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1109 $chromosome = $mapped_chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1110 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1111 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1112 die "Chromosome number extraction failed for $mapped_chromosome\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1113 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1114
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1115 ### Now extracting the number of mismatches to the converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1116 my $number_of_mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1117 if ($mismatch_info eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1118 $number_of_mismatches = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1119 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1120 elsif ($mismatch_info =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1121 my @mismatches = split (/,/,$mismatch_info);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1122 $number_of_mismatches = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1123 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1124 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1125 die "Something weird is going on with the mismatch field\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1126 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1127 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1128 ### extracting the chromosome number from the bowtie output (see above)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1129 my $alignment_location = join (":",$chromosome,$position);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1130 ### In the special case that two differently converted sequences align against differently converted genomes, but to the same position
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1131 ### with the same number of mismatches (or perfect matches), the chromosome, position and number of mismatches are the same. In this
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1132 ### case we are not writing the same entry out a second time.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1133 unless (exists $mismatches{$number_of_mismatches}->{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1134 $mismatches{$number_of_mismatches}->{$alignment_location}->{seq_id}=$id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1135 $mismatches{$number_of_mismatches}->{$alignment_location}->{bowtie_sequence}=$bowtie_sequence;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1136 $mismatches{$number_of_mismatches}->{$alignment_location}->{index}=$index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1137 $mismatches{$number_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1138 $mismatches{$number_of_mismatches}->{$alignment_location}->{position}=$position;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1139 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1140 ####################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1141 #### STEP III Now reading in one more line which has to be the next alignment to be analysed. Adding it to @fhs ->{last_line} ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1142 ####################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1143 $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1144 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1145 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1146 die "The same seq ID occurred more than twice in a row\n" if ($seq_id eq $identifier);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1147 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1148 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1149 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1150 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1151 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1152 # assigning undef to last_seq_id and last_line and jumping to the next index (end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1153 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1154 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1155 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1156 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1157 ### still within the 2nd sequence in correct orientation found
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1158 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1159 ### still within the 1st sequence in correct orientation found
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1160 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1161 ### still within the if (last_seq_id eq identifier) condition
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1162 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1163 ### still within foreach index loop
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1164 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1165 ### if there was not a single alignment found for a certain sequence we will continue with the next sequence in the sequence file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1166 unless(%mismatches){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1167 $counting{no_single_alignment_found}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1168 if ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1169 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1170 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1171 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1172 return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1173 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1174 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1175 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1176 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1177 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1178 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1179 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1180 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1181 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1182 ### Going to use the variable $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1183 my $sequence_fails = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1184 ### Declaring an empty hash reference which will store all information we need for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1185 my $methylation_call_params; # hash reference!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1186 ### sorting in ascending order
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1187 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1188
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1189 ### if there is only 1 entry in the hash with the lowest number of mismatches we accept it as the best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1190 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1191 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1192 $methylation_call_params->{$identifier}->{bowtie_sequence} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1193 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1194 $methylation_call_params->{$identifier}->{position} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1195 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1196 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1197 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1198 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1199 elsif (scalar keys %{$mismatches{$mismatch_number}} == 3){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1200 ### If there are 3 sequences with the same number of lowest mismatches we can discriminate 2 cases: (i) all 3 alignments are unique best hits and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1201 ### come from different alignment processes (== indices) or (ii) one sequence alignment (== index) will give a unique best alignment, whereas a
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1202 ### second one will produce 2 (or potentially many) alignments for the same sequence but in a different conversion state or against a different genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1203 ### version (or both). This becomes especially relevant for highly converted sequences in which all Cs have been converted to Ts in the bisulfite
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1204 ### reaction. E.g.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1205 ### CAGTCACGCGCGCGCG will become
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1206 ### TAGTTATGTGTGTGTG in the CT transformed version, which will ideally still give the correct alignment in the CT->CT alignment condition.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1207 ### If the same read will then become G->A transformed as well however, the resulting sequence will look differently and potentially behave
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1208 ### differently in a GA->GA alignment and this depends on the methylation state of the original sequence!:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1209 ### G->A conversion:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1210 ### highly methylated: CAATCACACACACACA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1211 ### highly converted : TAATTATATATATATA <== this sequence has a reduced complexity (only 2 bases left and not 3), and it is more likely to produce
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1212 ### an alignment with a low complexity genomic region than the one above. This would normally lead to the entire sequence being kicked out as
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1213 ### there will be 3 alignments with the same number of lowest mismatches!! This in turn means that highly methylated and thereby not converted
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1214 ### sequences are more likely to pass the alignment step, thereby creating a bias for methylated reads compared to their non-methylated counterparts.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1215 ### We do not want any bias, whatsoever. Therefore if we have 1 sequence producing a unique best alignment and the second and third conditions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1216 ### producing alignments only after performing an additional (theoretical) conversion we want to keep the best alignment with the lowest number of
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1217 ### additional transliterations performed. Thus we want to have a look at the level of complexity of the sequences producing the alignment.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1218 ### In the above example the number of transliterations required to transform the actual sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1219 ### to the C->T version would be TAGTTATGTGTGTGTG -> TAGTTATGTGTGTGTG = 0; (assuming this gives the correct alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1220 ### in the G->A case it would be TAGTTATGTGTGTGTG -> TAATTATATATATATA = 6; (assuming this gives multiple wrong alignments)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1221 ### if the sequence giving a unique best alignment required a lower number of transliterations than the second best sequence yielding alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1222 ### while requiring a much higher number of transliterations, we are going to accept the unique best alignment with the lowest number of performed
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1223 ### transliterations. As a threshold which does scale we will start with the number of transliterations of the lowest best match x 2 must still be
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1224 ### smaller than the number of transliterations of the second best sequence. Everything else will be flagged with $sequence_fails = 1 and discarded.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1225 my @three_candidate_seqs;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1226 foreach my $composite_location (keys (%{$mismatches{$mismatch_number}}) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1227 my $transliterations_performed;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1228 if ($mismatches{$mismatch_number}->{$composite_location}->{index} == 0 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1229 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'CT');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1230 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1231 elsif ($mismatches{$mismatch_number}->{$composite_location}->{index} == 2 or $mismatches{$mismatch_number}->{$composite_location}->{index} == 3){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1232 $transliterations_performed = determine_number_of_transliterations_performed($sequence,'GA');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1233 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1234 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1235 die "unexpected index number range $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1236 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1237 push @three_candidate_seqs,{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1238 index =>$mismatches{$mismatch_number}->{$composite_location}->{index},
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1239 bowtie_sequence => $mismatches{$mismatch_number}->{$composite_location}->{bowtie_sequence},
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1240 mismatch_number => $mismatch_number,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1241 chromosome => $mismatches{$mismatch_number}->{$composite_location}->{chromosome},
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1242 position => $mismatches{$mismatch_number}->{$composite_location}->{position},
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1243 seq_id => $mismatches{$mismatch_number}->{$composite_location}->{seq_id},
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1244 transliterations_performed => $transliterations_performed,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1245 };
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1246 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1247 ### sorting in ascending order for the lowest number of transliterations performed
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1248 @three_candidate_seqs = sort {$a->{transliterations_performed} <=> $b->{transliterations_performed}} @three_candidate_seqs;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1249 my $first_array_element = $three_candidate_seqs[0]->{transliterations_performed};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1250 my $second_array_element = $three_candidate_seqs[1]->{transliterations_performed};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1251 my $third_array_element = $three_candidate_seqs[2]->{transliterations_performed};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1252 # print "$first_array_element\t$second_array_element\t$third_array_element\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1253 if (($first_array_element*2) < $second_array_element){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1254 $counting{low_complexity_alignments_overruled_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1255 ### taking the index with the unique best hit and over ruling low complexity alignments with 2 hits
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1256 $methylation_call_params->{$identifier}->{bowtie_sequence} = $three_candidate_seqs[0]->{bowtie_sequence};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1257 $methylation_call_params->{$identifier}->{chromosome} = $three_candidate_seqs[0]->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1258 $methylation_call_params->{$identifier}->{position} = $three_candidate_seqs[0]->{position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1259 $methylation_call_params->{$identifier}->{index} = $three_candidate_seqs[0]->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1260 $methylation_call_params->{$identifier}->{number_of_mismatches} = $mismatch_number;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1261 # print "Overruled low complexity alignments! Using $first_array_element and disregarding $second_array_element and $third_array_element\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1262 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1263 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1264 $sequence_fails = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1265 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1266 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1267 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1268 $sequence_fails = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1269 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1270 ### after processing the alignment with the lowest number of mismatches we exit
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1271 last;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1272 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1273 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1274 if ($sequence_fails == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1275 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1276 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1277 return 2; # => exits to next sequence, and prints it out to multiple_alignments.out if --ambiguous has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1278 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1279 if ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1280 return 1; # => exits to next sequence, and prints it out to unmapped.out if --un has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1281 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1282 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1283 return 0; # => exits to next sequence (default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1284 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1285 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1286
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1287 ### --DIRECTIONAL
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1288 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1289 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1290 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1291 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1292 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1293 $counting{alignments_rejected_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1294 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1295 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1296 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1297
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1298 ### If the sequence has not been rejected so far it will have a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1299 $counting{unique_best_alignment_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1300 extract_corresponding_genomic_sequence_single_end($identifier,$methylation_call_params);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1301 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1302 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1303 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1304 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1305 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1306 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1307
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1308 ### otherwise we are set to perform the actual methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1309 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1310
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1311 print_bisulfite_mapping_result_single_end($identifier,$sequence,$methylation_call_params,$quality_value);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1312 return 0; ## otherwise 1 will be returned by default, which would print the sequence to unmapped.out
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1313 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1314
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1315 sub check_bowtie_results_single_end_bowtie2{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1316 my ($sequence,$identifier,$quality_value) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1317
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1318 unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1319 $quality_value = 'I'x(length$sequence);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1320 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1321
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1322 # as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1323 # $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1324
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1325 my $alignment_ambiguous = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1326
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1327 my %alignments = ();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1328
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1329 ### reading from the Bowtie 2 output filehandles
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1330 foreach my $index (0..$#fhs){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1331 # print "Index: $index\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1332 # print "$fhs[$index]->{last_line}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1333 # print "$fhs[$index]->{last_seq_id}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1334
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1335 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1336 next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1337
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1338 ### if the sequence we are currently looking at produced an alignment we are doing various things with it
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1339 # print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1340
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1341 if ($fhs[$index]->{last_seq_id} eq $identifier) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1342
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1343 # SAM format specifications for Bowtie 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1344 # (1) Name of read that aligned
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1345 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1346 # 1 The read is one of a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1347 # 2 The alignment is one end of a proper paired-end alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1348 # 4 The read has no reported alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1349 # 8 The read is one of a pair and has no reported alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1350 # 16 The alignment is to the reverse reference strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1351 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1352 # 64 The read is mate 1 in a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1353 # 128 The read is mate 2 in a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1354 # 256 The read has multiple mapping states
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1355 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1356 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1357 # (5) Mapping quality (255 means MAPQ is not available)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1358 # (6) CIGAR string representation of alignment (* if unavailable)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1359 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1360 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1361 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1362 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1363 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1364 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1365 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1366 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1367 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1368 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1369 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1370 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1371 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1372 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1373 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1374 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1375
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1376 my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1377
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1378 ### If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1379 if ($flag == 4){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1380 ## reading in the next alignment, which must be the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1381 my $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1382 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1383 chomp $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1384 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1385 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1386 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1387 if ($seq_id eq $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1388 die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1389 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1390 next; # next instance
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1391 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1392 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1393 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1394 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1395 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1396 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1397 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1398 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1399
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1400 # if there are one or more proper alignments we can extract the chromosome number
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1401 my $chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1402 if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1403 $chromosome = $mapped_chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1404 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1405 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1406 die "Chromosome number extraction failed for $mapped_chromosome\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1407 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1408
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1409 ### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1410 my ($alignment_score,$second_best,$MD_tag);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1411 my @fields = split (/\t/,$fhs[$index]->{last_line});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1412
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1413 foreach (11..$#fields){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1414 if ($fields[$_] =~ /AS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1415 $alignment_score = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1416 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1417 elsif ($fields[$_] =~ /XS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1418 $second_best = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1419 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1420 elsif ($fields[$_] =~ /MD:Z:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1421 $MD_tag = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1422 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1423 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1424
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1425 # warn "First best alignment_score is: '$alignment_score'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1426 # warn "MD tag is: '$MD_tag'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1427 die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag)!\n" unless (defined $alignment_score and defined $MD_tag);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1428
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1429 if (defined $second_best){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1430 # warn "second best alignment_score is: '$second_best'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1431
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1432 # If the first alignment score is the same as the alignment score of the second best hit we are going to boot this sequence altogether
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1433 if ($alignment_score == $second_best){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1434 $alignment_ambiguous = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1435 ## need to read and discard all additional ambiguous reads until we reach the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1436 until ($fhs[$index]->{last_seq_id} ne $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1437 my $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1438 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1439 chomp $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1440 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1441 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1442 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1443 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1444 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1445 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1446 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1447 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1448 last; # break free in case we have reached the end of the alignment output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1449 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1450 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1451 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1452 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1453 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1454
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1455 my $alignment_location = join (":",$chromosome,$position);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1456
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1457 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1458 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1459 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1460 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1461
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1462 unless (exists $alignments{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1463 $alignments{$alignment_location}->{seq_id} = $id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1464 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1465 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1466 $alignments{$alignment_location}->{index} = $index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1467 $alignments{$alignment_location}->{chromosome} = $chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1468 $alignments{$alignment_location}->{position} = $position;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1469 $alignments{$alignment_location}->{CIGAR} = $cigar;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1470 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1471 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1472
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1473 ### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1474 until ($fhs[$index]->{last_seq_id} ne $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1475 my $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1476 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1477 chomp $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1478 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1479 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1480 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1481 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1482 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1483 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1484 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1485 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1486 last; # break free in case we have reached the end of the alignment output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1487 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1488 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1489 # warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1490 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1491 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1492 else{ # there is no second best hit, so we can just store this one and read in the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1493
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1494 my $alignment_location = join (":",$chromosome,$position);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1495
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1496 ### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1497 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1498 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1499 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1500
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1501 unless (exists $alignments{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1502 $alignments{$alignment_location}->{seq_id} = $id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1503 $alignments{$alignment_location}->{alignment_score} = $alignment_score;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1504 $alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1505 $alignments{$alignment_location}->{index} = $index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1506 $alignments{$alignment_location}->{chromosome} = $chromosome;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1507 $alignments{$alignment_location}->{position} = $position;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1508 $alignments{$alignment_location}->{MD_tag} = $MD_tag;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1509 $alignments{$alignment_location}->{CIGAR} = $cigar;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1510 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1511
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1512 my $newline = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1513 if ($newline){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1514 chomp $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1515 my ($seq_id) = split (/\t/,$newline);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1516 $fhs[$index]->{last_seq_id} = $seq_id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1517 $fhs[$index]->{last_line} = $newline;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1518 if ($seq_id eq $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1519 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1520 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1521 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1522 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1523 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1524 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1525 $fhs[$index]->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1526 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1527 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1528 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1529 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1530
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1531 ### if the read already produced several ambiguous alignments we can return right away. If --ambiguous or --unmapped was specified the read sequence will be printed out.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1532 if ($alignment_ambiguous == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1533 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1534 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1535 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1536 # print "$ambiguous_read_output\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1537
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1538 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1539 return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.txt if '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1540 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1541 elsif ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1542 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1543 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1544 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1545 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1546 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1547 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1548
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1549 ### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1550 unless(%alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1551 $counting{no_single_alignment_found}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1552 # my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1553 # print "$unmapped_read_output\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1554 if ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1555 return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1556 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1557 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1558 return 0; # default
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1559 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1560 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1561
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1562 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1563
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1564 ### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1565 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1566 ### alignment score we are discarding the sequence altogether.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1567 ### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1568 ### opening (5) and extending (3 per bp) the gap.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1569
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1570 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1571
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1572 my $methylation_call_params; # hash reference which will store all information we need for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1573 my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1574
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1575 ### print contents of %alignments for debugging
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1576 # if (scalar keys %alignments > 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1577 # print "\n******\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1578 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1579 # print "Loc: $alignment_location\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1580 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1581 # print "AS: $alignments{$alignment_location}->{alignment_score}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1582 # print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1583 # print "Index $alignments{$alignment_location}->{index}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1584 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1585 # print "pos: $alignments{$alignment_location}->{position}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1586 # print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1587 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1588 # print "\n******\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1589 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1590
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1591 ### if there is only 1 entry in the hash we accept it as the best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1592 if (scalar keys %alignments == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1593 for my $unique_best_alignment (keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1594 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1595 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1596 $methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1597 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1598 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1599 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1600 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1601 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1602 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1603
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1604 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1605 ### we boot the sequence altogether)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1606 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1607 my $best_alignment_score;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1608 my $best_alignment_location;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1609 foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1610 # print "$alignments{$alignment_location}->{alignment_score}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1611 unless (defined $best_alignment_score){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1612 $best_alignment_score = $alignments{$alignment_location}->{alignment_score};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1613 $best_alignment_location = $alignment_location;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1614 # print "setting best alignment score: $best_alignment_score\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1615 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1616 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1617 ### if the second best alignment has the same alignment score as the first one, the sequence will get booted
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1618 if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1619 # warn "Same alignment score, the sequence will get booted!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1620 $sequence_fails = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1621 last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1622 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1623 ### else we are going to store the best alignment for further processing
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1624 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1625 $methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1626 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1627 $methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1628 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1629 $methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1630 $methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1631 $methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1632 last; # exiting after processing the second alignment since the sequence produced a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1633 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1634 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1635 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1636 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1637 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1638 die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1639 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1640
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1641 ### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1642 if ($sequence_fails == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1643 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1644
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1645 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1646 # my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1647 # print OUT "$ambiguous_read_output\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1648
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1649 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1650 return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1651 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1652 elsif ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1653 return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1654 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1655 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1656 return 0; # => exits to next sequence (default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1657 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1658 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1659
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1660 ### --DIRECTIONAL
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1661 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1662 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1663 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1664 if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1665 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1666 $counting{alignments_rejected_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1667 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1668 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1669 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1670
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1671 ### If the sequence has not been rejected so far it has a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1672 $counting{unique_best_alignment_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1673
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1674 ### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1675 extract_corresponding_genomic_sequence_single_end_bowtie2 ($identifier,$methylation_call_params);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1676
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1677 ### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1678 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1679 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1680 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1681 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1682 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1683
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1684
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1685 ### otherwise we are set to perform the actual methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1686 $methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1687 print_bisulfite_mapping_result_single_end_bowtie2 ($identifier,$sequence,$methylation_call_params,$quality_value);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1688 return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1689 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1690
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1691
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Counts how many characters of a read are subject to the given conversion type.
###
### Arguments:
###   $sequence        - the read sequence; only a local copy is inspected, the caller's string is never modified
###   $read_conversion - 'CT' (every 'C' is counted) or 'GA' (every 'G' is counted); matching is case-sensitive
###
### Returns the number of matching characters (0 if there are none).
### Dies if $read_conversion is undefined or anything other than 'CT' or 'GA'.
sub determine_number_of_transliterations_performed{
  my ($sequence,$read_conversion) = @_;

  ### an undefined conversion mode is handled like an unknown one, so that we die with a clean message
  ### instead of emitting 'uninitialized value' warnings from the string comparisons first
  my $conversion_mode = defined $read_conversion ? $read_conversion : '';

  ### tr/// with an empty replacement list leaves the string alone and only returns the number of
  ### matching characters, which is the same count an actual C->T or G->A transliteration would report
  my $number_of_transliterations;
  if ($conversion_mode eq 'CT'){
    $number_of_transliterations = ($sequence =~ tr/C//);
  }
  elsif ($conversion_mode eq 'GA'){
    $number_of_transliterations = ($sequence =~ tr/G//);
  }
  else{
    ### no system call has failed at this point, so $! (errno) must not be appended to the message
    die "Read conversion mode of the read was not specified\n";
  }
  return $number_of_transliterations;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1706
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Decides whether the Bowtie 1 alignment buffered in $fhs[$index] can be used for the read $identifier.
###
### Arguments:
###   $index      - index into @fhs selecting the alignment filehandle entry to inspect
###   $identifier - ID of the read that is currently being processed
###
### Returns 1 if $fhs[$index]->{last_line} (possibly replaced by the next line of the same read) holds an
### alignment of $identifier for which ensure_sensical_alignment_orientation_single_end returned 1.
### Returns 0 otherwise. Whenever lines are read from the filehandle, last_seq_id/last_line are updated to
### the most recently read line, or set to undef once the filehandle returns no further line.
### Dies if the orientation check returns something other than 1 or 0, or if a third consecutive line
### carries the same read ID.
sub decide_whether_single_end_alignment_is_valid{
  my ($index,$identifier) = @_;

  ### helper: buffers an entry (or undef/undef at the end of the bowtie output) for the next round
  my $store_entry = sub {
    my ($seq_id,$line) = @_;
    $fhs[$index]->{last_seq_id} = $seq_id;
    $fhs[$index]->{last_line}   = $line;
  };

  ### Bowtie 1 format: the first two tab separated fields are the read ID and the strand
  my ($id,$strand) = (split (/\t/,$fhs[$index]->{last_line}))[0,1];

  ### the buffered entry already belongs to a different read -> it will be analysed in a later round
  return 0 unless (($id eq $fhs[$index]->{last_seq_id}) and ($id eq $identifier));

  ### 1st chance to pass: the buffered alignment itself is in a sensible orientation
  my $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  return 1 if ($orientation == 1);
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### the buffered alignment was in the wrong orientation, so we look at the following line
  my $next_line = $fhs[$index]->{fh}->getline();
  unless ($next_line){
    $store_entry->(undef,undef); # end of bowtie output
    return 0;
  }

  ($id,$strand) = (split (/\t/,$next_line))[0,1];

  ### the following line is already the next read -> buffer it and process it in the next round
  unless ($id eq $identifier){
    $store_entry->($id,$next_line);
    return 0;
  }

  ### 2nd chance to pass: a second alignment of the same read, this time in a sensible orientation
  $orientation = ensure_sensical_alignment_orientation_single_end ($index,$strand);
  if ($orientation == 1){
    $store_entry->($id,$next_line);
    return 1;
  }
  die "The orientation of the alignment must be either correct or incorrect\n" unless ($orientation == 0);

  ### both alignments of this read were in the wrong orientation; whatever comes next has to be a
  ### different read and gets buffered for the next round
  $next_line = $fhs[$index]->{fh}->getline();
  unless ($next_line){
    $store_entry->(undef,undef); # end of bowtie output
    return 0;
  }

  my ($seq_id) = split (/\t/,$next_line);
  die "Same seq ID 3 or more times in a row!(should be 2 max) $!" if ($seq_id eq $identifier);
  $store_entry->($seq_id,$next_line);
  return 0;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1784 #########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1785 ### BOWTIE 1 | PAIRED-END
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1786 #########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1787
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1788 sub check_bowtie_results_paired_ends{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1789 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1790
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1791 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1792 unless ($quality_value_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1793 $quality_value_1 = 'I'x(length$sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1794 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1795 unless ($quality_value_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1796 $quality_value_2 = 'I'x(length$sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1797 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1798
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1799 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1800
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1801 my %mismatches = ();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1802 ### reading from the bowtie output files to see if this sequence pair aligned to a bisulfite converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1803
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1804
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1805 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1806 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1807 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1808 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignment to the complementary
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1809 ### strands are not being reported by specifying --directional
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1810
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1811 foreach my $index (0,3,1,2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1812 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1813 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1814 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1815 if ($fhs[$index]->{last_seq_id} eq $identifier) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1816 # print "$identifier\n$fhs[$index]->{last_seq_id}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1817
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1818 ##################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1819 ### STEP I Processing the entry which is stored in last_line_1 and last_line_2 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1820 ##################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1821 my $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1822 ### sequences can fail at this point if there was only 1 alignment in the wrong orientation, or if there were 2 alignments both in the wrong
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1823 ### orientation. We only continue to extract useful information about this alignment if 1 was returned
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1824 if ($valid_alignment_found == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1825 ### Bowtie outputs which made it this far are in the correct orientation, so we can continue to analyse the alignment itself.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1826 ### we store the useful information in %mismatches
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1827 my ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1828 my ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1829 chomp $mismatch_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1830 chomp $mismatch_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1831
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1832 ### need to extract the chromosome number from the bowtie output (which is either XY_CT_converted or XY_GA_converted
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1833 my ($chromosome_1,$chromosome_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1834 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1835 $chromosome_1 = $mapped_chromosome_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1836 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1837 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1838 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1839 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1840 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1841 $chromosome_2 = $mapped_chromosome_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1842 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1843 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1844 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1845 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1846
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1847 ### Now extracting the number of mismatches to the converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1848 my $number_of_mismatches_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1849 my $number_of_mismatches_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1850 if ($mismatch_info_1 eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1851 $number_of_mismatches_1 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1852 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1853 elsif ($mismatch_info_1 =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1854 my @mismatches = split (/,/,$mismatch_info_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1855 $number_of_mismatches_1 = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1856 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1857 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1858 die "Something weird is going on with the mismatch field\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1859 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1860 if ($mismatch_info_2 eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1861 $number_of_mismatches_2 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1862 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1863 elsif ($mismatch_info_2 =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1864 my @mismatches = split (/,/,$mismatch_info_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1865 $number_of_mismatches_2 = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1866 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1867 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1868 die "Something weird is going on with the mismatch field\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1869 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1870 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1871 my $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1872 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1873 die "Position 1 is higher than position 2" if ($position_1 > $position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1874 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1875 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1876 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1877 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1878 ### location (the genomic sequence extraction and methylation would not be affected by this, only the thing which would change is the index
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1879 ### number for the found alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1880 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1881 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1882 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1883 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1884 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1885 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1886 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1887 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1888 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1889 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1890 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1891 ###################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1892 ### STEP II Now reading in the next 2 lines from the bowtie filehandle. If there are 2 next lines in the alignments filehandle it can either ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1893 ### be a second alignment of the same sequence pair or a new sequence pair. In any case we will just add it to last_line_1 and last_line_2.  ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1894 ### If it is the alignment of the next sequence pair, 0 will be returned as $valid_alignment_found, so it will not be processed any further in ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1895 ### this round ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1896 ###################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1897 my $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1898 my $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1899
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1900 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1901 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1902 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1903
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1904 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1905 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1906 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1907 elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1908 $fhs[$index]->{last_seq_id} = $seq_id_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1909 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1910 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1911 die "Either read 1 or read 2 needs to end on '/1'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1912 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1913
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1914 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1915 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1916 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1917 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1918 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1919 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1920 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1921 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1922 next; # jumping to the next index
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1923 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1924 ### Now processing the entry we just stored in last_line_1 and last_line_2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1925 $valid_alignment_found = decide_whether_paired_end_alignment_is_valid($index,$identifier);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1926 ### only processing the alignment further if 1 was returned. 0 will be returned either if the alignment is already the next sequence pair to
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1927 ### be analysed or if it was a second alignment of the current sequence pair but in the wrong orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1928 if ($valid_alignment_found == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1929 ### we store the useful information in %mismatches
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1930 ($id_1,$strand_1,$mapped_chromosome_1,$position_1,$bowtie_sequence_1,$mismatch_info_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1931 ($id_2,$strand_2,$mapped_chromosome_2,$position_2,$bowtie_sequence_2,$mismatch_info_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,7];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1932 chomp $mismatch_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1933 chomp $mismatch_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1934 ### need to extract the chromosome number from the bowtie output (which is either _CT_converted or _GA_converted)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1935 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1936 $chromosome_1 = $mapped_chromosome_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1937 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1938 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1939 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1940 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1941 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1942 $chromosome_2 = $mapped_chromosome_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1943 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1944 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1945 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1946 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1947
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1948 $number_of_mismatches_1='';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1949 $number_of_mismatches_2='';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1950 ### Now extracting the number of mismatches to the converted genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1951 if ($mismatch_info_1 eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1952 $number_of_mismatches_1 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1953 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1954 elsif ($mismatch_info_1 =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1955 my @mismatches = split (/,/,$mismatch_info_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1956 $number_of_mismatches_1 = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1957 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1958 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1959 die "Something weird is going on with the mismatch field\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1960 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1961 if ($mismatch_info_2 eq ''){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1962 $number_of_mismatches_2 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1963 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1964 elsif ($mismatch_info_2 =~ /^\d/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1965 my @mismatches = split (/,/,$mismatch_info_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1966 $number_of_mismatches_2 = scalar @mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1967 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1968 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1969 die "Something weird is going on with the mismatch field\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1970 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1971 ### To decide whether a sequence pair has a unique best alignment we will look at the lowest sum of mismatches from both alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1972 $sum_of_mismatches = $number_of_mismatches_1+$number_of_mismatches_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1973 ### creating a composite location variable from $chromosome and $position and storing the alignment information in a temporary hash table
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1974 die "position 1 is greater than position 2" if ($position_1 > $position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1975 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1976 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1977 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1978 ### strand) were methylated and therefore protected. It is not needed to overwrite the same positional entry with a second entry for the same
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1979 ### location (the genomic sequence extraction and methylation would not be affected by this, the only thing which would change is the index
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1980 ### number for the found alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1981 unless (exists $mismatches{$sum_of_mismatches}->{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1982 $mismatches{$sum_of_mismatches}->{$alignment_location}->{seq_id}=$id_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1983 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_1}=$bowtie_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1984 $mismatches{$sum_of_mismatches}->{$alignment_location}->{bowtie_sequence_2}=$bowtie_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1985 $mismatches{$sum_of_mismatches}->{$alignment_location}->{index}=$index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1986 $mismatches{$sum_of_mismatches}->{$alignment_location}->{chromosome}=$chromosome_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1987 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_1}=$position_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1988 $mismatches{$sum_of_mismatches}->{$alignment_location}->{start_seq_2}=$position_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1989 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_1} = $number_of_mismatches_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1990 $mismatches{$sum_of_mismatches}->{$alignment_location}->{number_of_mismatches_2} = $number_of_mismatches_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1991 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1992 ###############################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1993 ### STEP III Now reading in two more lines. These have to be the next entry and we will just assign them to last_line_1 and last_line_2     ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1994 ###############################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1995 $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1996 $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1997
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1998 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
1999 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2000 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2001
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2002 if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2003 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2004 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2005 if ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2006 $fhs[$index]->{last_seq_id} = $seq_id_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2007 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2008 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2009 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2010 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2011 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2012 # assigning undef to last_seq_id and both last_lines and jumping to the next index (end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2013 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2014 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2015 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2016 next; # jumping to the next index
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2017 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2018 ### within the 2nd sequence pair alignment in correct orientation found
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2019 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2020 ### within the 1st sequence pair alignment in correct orientation found
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2021 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2022 ### still within the (last_seq_id eq identifier) condition
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2023 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2024 ### still within foreach index loop
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2025 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2026 ### if there was no single alignment found for a certain sequence we will continue with the next sequence in the sequence file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2027 unless(%mismatches){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2028 $counting{no_single_alignment_found}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2029 return 1; ### We will print this sequence out as unmapped sequence if --un unmapped.out has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2030 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2031 ### Going to use the variable $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2032 my $sequence_pair_fails = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2033 ### Declaring an empty hash reference which will store all information we need for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2034 my $methylation_call_params; # hash reference!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2035 ### We are now looking if there is a unique best alignment for a certain sequence. This means we are sorting in ascending order and look at the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2036 ### sequence with the lowest amount of mismatches. If there is only one single best position we are going to store the alignment information in the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2037 ### meth_call variables, if there are multiple hits with the same amount of (lowest) mismatches we are discarding the sequence altogether
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2038 foreach my $mismatch_number (sort {$a<=>$b} keys %mismatches){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2039 #dev print "Number of mismatches: $mismatch_number\t$identifier\t$sequence_1\t$sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2040 foreach my $entry (keys (%{$mismatches{$mismatch_number}}) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2041 #dev print "$mismatch_number\t$entry\t$mismatches{$mismatch_number}->{$entry}->{index}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2042 # print join("\t",$mismatch_number,$mismatches{$mismatch_number}->{$entry}->{seq_id},$sequence,$mismatches{$mismatch_number}->{$entry}->{bowtie_sequence},$mismatches{$mismatch_number}->{$entry}->{chromosome},$mismatches{$mismatch_number}->{$entry}->{position},$mismatches{$mismatch_number}->{$entry}->{index}),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2043 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2044 if (scalar keys %{$mismatches{$mismatch_number}} == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2045 # print "Unique best alignment for sequence pair $sequence_1\t$sequence_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2046 for my $unique_best_alignment (keys %{$mismatches{$mismatch_number}}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2047 $methylation_call_params->{$identifier}->{seq_id} = $identifier;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2048 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2049 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2050 $methylation_call_params->{$identifier}->{chromosome} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2051 $methylation_call_params->{$identifier}->{start_seq_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2052 $methylation_call_params->{$identifier}->{start_seq_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2053 $methylation_call_params->{$identifier}->{alignment_end} = ($mismatches{$mismatch_number}->{$unique_best_alignment}->{start_seq_2}+length($mismatches{$mismatch_number}->{$unique_best_alignment}->{bowtie_sequence_2}));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2054 $methylation_call_params->{$identifier}->{index} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2055 $methylation_call_params->{$identifier}->{number_of_mismatches_1} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2056 $methylation_call_params->{$identifier}->{number_of_mismatches_2} = $mismatches{$mismatch_number}->{$unique_best_alignment}->{number_of_mismatches_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2057 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2058 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2059 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2060 $sequence_pair_fails = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2061 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2062 ### after processing the alignment with the lowest number of mismatches we exit
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2063 last;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2064 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2065 ### skipping the sequence completely if there were multiple alignments with the same amount of lowest mismatches found at different positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2066 if ($sequence_pair_fails == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2067 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2068 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2069 return 2; # => exits to next sequence pair, and prints both seqs out to multiple_alignments_1 and -2 if --ambiguous has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2070 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2071 if ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2072 return 1; # => exits to next sequence pair, and prints both seqs out to unmapped_1 and _2 if --un has been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2073 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2074 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2075 return 0; # => exits to next sequence (default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2076 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2077 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2078
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2079 ### --DIRECTIONAL
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2080 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2081 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2082 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2083 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2084 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2085 $counting{alignments_rejected_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2086 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2087 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2088 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2089
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2090 ### If the sequence has not been rejected so far it does have a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2091 $counting{unique_best_alignment_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2092 extract_corresponding_genomic_sequence_paired_ends($identifier,$methylation_call_params);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2093
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2094 ### check that the genomic sequences we extracted have the same length as the observed sequences +2; only then do we perform the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2095 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2096 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2097 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2098 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2099 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2100 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2101 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2102 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2103 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2104 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2105
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2106 ### otherwise we are set to perform the actual methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2107 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2108 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2109
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2110 print_bisulfite_mapping_results_paired_ends($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2111 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2112 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2113
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2114 #########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2115 ### BOWTIE 2 | PAIRED-END
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2116 #########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2117
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2118 sub check_bowtie_results_paired_ends_bowtie2{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2119 my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2120
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2121 ### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2122 unless ($quality_value_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2123 $quality_value_1 = 'I'x(length$sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2124 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2125
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2126 unless ($quality_value_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2127 $quality_value_2 = 'I'x(length$sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2128 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2129
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2130
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2131 # print "$identifier\n$fhs[0]->{last_seq_id}\n$fhs[1]->{last_seq_id}\n$fhs[2]->{last_seq_id}\n$fhs[3]->{last_seq_id}\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2132
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2133
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2134 my %alignments;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2135 my $alignment_ambiguous = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2136
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2137 ### reading from the Bowtie 2 output filehandles
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2138
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2139 ### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similar to the single end way.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2140 ### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2141 ### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2142 ### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2143 ### strands are not being reported when '--directional' is specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2144
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2145 foreach my $index (0,3,1,2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2146 ### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2147 next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2148
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2149 ### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2150 if ($fhs[$index]->{last_seq_id} eq $identifier) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2151
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2152 my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2153 my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2154 # print "Index: $index\t$fhs[$index]->{last_line_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2155 # print "Index: $index\t$fhs[$index]->{last_line_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2156 # print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2157 # print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2158 $id_1 =~ s/\/1$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2159 $id_2 =~ s/\/2$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2160
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2161 # SAM format specifications for Bowtie 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2162 # (1) Name of read that aligned
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2163 # (2) Sum of all applicable flags. Flags relevant to Bowtie are:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2164 # 1 The read is one of a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2165 # 2 The alignment is one end of a proper paired-end alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2166 # 4 The read has no reported alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2167 # 8 The read is one of a pair and has no reported alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2168 # 16 The alignment is to the reverse reference strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2169 # 32 The other mate in the paired-end alignment is aligned to the reverse reference strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2170 # 64 The read is mate 1 in a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2171 # 128 The read is mate 2 in a pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2172 # 256 The read has multiple mapping states
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2173 # (3) Name of reference sequence where alignment occurs (unmapped reads have a *)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2174 # (4) 1-based offset into the forward reference strand where leftmost character of the alignment occurs (0 for unmapped reads)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2175 # (5) Mapping quality (255 means MAPQ is not available)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2176 # (6) CIGAR string representation of alignment (* if unavailable)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2177 # (7) Name of reference sequence where mate's alignment occurs. Set to = if the mate's reference sequence is the same as this alignment's, or * if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2178 # (8) 1-based offset into the forward reference strand where leftmost character of the mate's alignment occurs. Offset is 0 if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2179 # (9) Inferred fragment size. Size is negative if the mate's alignment occurs upstream of this alignment. Size is 0 if there is no mate.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2180 # (10) Read sequence (reverse-complemented if aligned to the reverse strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2181 # (11) ASCII-encoded read qualities (reverse-complemented if the read aligned to the reverse strand). The encoded quality values are on the Phred quality scale and the encoding is ASCII-offset by 33 (ASCII char !), similarly to a FASTQ file.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2182 # (12) Optional fields. Fields are tab-separated. bowtie2 outputs zero or more of these optional fields for each alignment, depending on the type of the alignment:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2183 # AS:i:<N> Alignment score. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2184 # XS:i:<N> Alignment score for second-best alignment. Can be negative. Can be greater than 0 in --local mode (but not in --end-to-end mode). Only present if the SAM record is for an aligned read and more than one alignment was found for the read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2185 # YS:i:<N> Alignment score for opposite mate in the paired-end alignment. Only present if the SAM record is for a read that aligned as part of a paired-end alignment.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2186 # XN:i:<N> The number of ambiguous bases in the reference covering this alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2187 # XM:i:<N> The number of mismatches in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2188 # XO:i:<N> The number of gap opens, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2189 # XG:i:<N> The number of gap extensions, for both read and reference gaps, in the alignment. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2190 # NM:i:<N> The edit distance; that is, the minimal number of one-nucleotide edits (substitutions, insertions and deletions) needed to transform the read string into the reference string. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2191 # YF:Z:<N> String indicating reason why the read was filtered out. See also: Filtering. Only appears for reads that were filtered out.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2192 # MD:Z:<S> A string representation of the mismatched reference bases in the alignment. See SAM format specification for details. Only present if SAM record is for an aligned read.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2193
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2194 ### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2195 ### We can store the next alignment and move on to the next Bowtie 2 instance
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2196 if ($flag_1 == 77 and $flag_2 == 141){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2197 ## reading in the next alignment, which must be the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2198 my $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2199 my $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2200
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2201 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2202 chomp $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2203 chomp $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2204 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2205 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2206 $seq_id_1 =~ s/\/1$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2207 $seq_id_2 =~ s/\/2$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2208 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2209 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2210 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2211
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2212 # print "current sequence ($identifier) did not map, reading in next sequence\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2213 # print "$index\t$fhs[$index]->{last_seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2214 # print "$index\t$fhs[$index]->{last_line_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2215 # print "$index\t$fhs[$index]->{last_line_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2216 next; # next instance
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2217 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2218 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2219 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2220 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2221 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2222 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2223 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2224 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2225 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2226
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2227 ### If there are one or more proper alignments we can extract the chromosome number
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2228 my ($chromosome_1,$chromosome_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2229 if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2230 $chromosome_1 = $mapped_chromosome_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2231 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2232 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2233 die "Chromosome number extraction failed for $mapped_chromosome_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2234 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2235 if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2236 $chromosome_2 = $mapped_chromosome_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2237 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2238 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2239 die "Chromosome number extraction failed for $mapped_chromosome_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2240 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2241
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2242 die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2243
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2244 ### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2245 my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2246
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2247 my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2248 my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2249
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2250 foreach (11..$#fields_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2251 if ($fields_1[$_] =~ /AS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2252 $alignment_score_1 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2253 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2254 elsif ($fields_1[$_] =~ /XS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2255 $second_best_1 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2256 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2257 elsif ($fields_1[$_] =~ /MD:Z:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2258 $MD_tag_1 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2259 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2260 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2261
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2262 foreach (11..$#fields_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2263 if ($fields_2[$_] =~ /AS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2264 $alignment_score_2 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2265 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2266 elsif ($fields_2[$_] =~ /XS:i:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2267 $second_best_2 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2268 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2269 elsif ($fields_2[$_] =~ /MD:Z:(.*)/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2270 $MD_tag_2 = $1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2271 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2272 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2273
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2274 die "Failed to extract alignment score 1 ($alignment_score_1) and MD tag ($MD_tag_1)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_1 and defined $MD_tag_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2275 die "Failed to extract alignment score 2 ($alignment_score_2) and MD tag ($MD_tag_2)!\nlast alignment 1: $fhs[$index]->{last_line_1}\nlast alignment 2: $fhs[$index]->{last_line_2}\n" unless (defined $alignment_score_2 and defined $MD_tag_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2276
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2277 # warn "First read 1 alignment score is: '$alignment_score_1'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2278 # warn "First read 2 alignment score is: '$alignment_score_2'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2279 # warn "MD tag 1 is: '$MD_tag_1'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2280 # warn "MD tag 2 is: '$MD_tag_2'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2281
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2282 ### To decide whether a sequence pair has a unique best alignment we will look at the highest sum of alignment scores from both alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2283 my $sum_of_alignment_scores_1 = $alignment_score_1 + $alignment_score_2 ;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2284 # print "sum of alignment scores: $sum_of_alignment_scores_1\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2285
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2286 if (defined $second_best_1 and defined $second_best_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2287 my $sum_of_alignment_scores_second_best = $second_best_1 + $second_best_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2288 # warn "Second best alignment_score_1 is: '$second_best_1'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2289 # warn "Second best alignment_score_2 is: '$second_best_2'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2290 # warn "Second best alignment sum of alignment scores is: '$sum_of_alignment_scores_second_best'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2291
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2292 # If the first alignment score for the first read pair is the same as the alignment score of the second best hit we are going to boot this sequence pair altogether
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2293 if ($sum_of_alignment_scores_1 == $sum_of_alignment_scores_second_best){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2294 $alignment_ambiguous = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2295 # print "This read will be chucked (AS==XS detected)!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2296
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2297 ## need to read and discard all additional ambiguous reads until we reach the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2298 until ($fhs[$index]->{last_seq_id} ne $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2299 my $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2300 my $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2301 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2302 chomp $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2303 chomp $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2304 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2305 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2306 $seq_id_1 =~ s/\/1$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2307 $seq_id_2 =~ s/\/2$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2308 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2309
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2310 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2311 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2312 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2313 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2314 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2315 # assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2316 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2317 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2318 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2319 last; # break free if the end of the alignment output was reached
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2320 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2321 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2322 # if ($fhs[$index]->{last_seq_id}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2323 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2324 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2325 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2326 else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2327
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2328 my $alignment_location;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2329 if ($position_1 <= $position_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2330 $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2331 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2332 elsif($position_2 < $position_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2333 $alignment_location = join(":",$chromosome_1,$position_2,$position_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2334 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2335
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2336 ### If a sequence aligns to exactly the same location twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2337 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2338 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2339 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2340
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2341 unless (exists $alignments{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2342 $alignments{$alignment_location}->{seq_id} = $id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2343 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2344 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2345 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2346 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2347 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2348 $alignments{$alignment_location}->{index} = $index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2349 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2350 $alignments{$alignment_location}->{position_1} = $position_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2351 $alignments{$alignment_location}->{position_2} = $position_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2352 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2353 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2354 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2355 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2356 $alignments{$alignment_location}->{flag_1} = $flag_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2357 $alignments{$alignment_location}->{flag_2} = $flag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2358 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2359 # warn "added best of several alignments to \%alignments hash\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2360
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2361 ### now reading and discarding all (inferior) alignments of this read pair until we hit the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2362 until ($fhs[$index]->{last_seq_id} ne $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2363 my $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2364 my $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2365 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2366 chomp $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2367 chomp $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2368 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2369 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2370 $seq_id_1 =~ s/\/1$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2371 $seq_id_2 =~ s/\/2$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2372 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2373
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2374 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2375 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2376 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2377 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2378 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2379 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2380 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2381 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2382 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2383 last; # break free if the end of the alignment output was reached
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2384 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2385 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2386 # if($fhs[$index]->{last_seq_id}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2387 # warn "Index: $index\tThis Seq-ID is $identifier, skipped all other alignments until the next ID was reached which is: $fhs[$index]->{last_seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2388 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2389 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2390 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2391 else{ # there is no second best hit, so we can just store this one and read in the next sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2392
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2393 my $alignment_location = join(":",$chromosome_1,$position_1,$position_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2394 # print "$alignment_location\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2395 ### If a sequence aligns to exactly the same location with a perfect match twice, the sequence either does not contain any C or G, or all the Cs (or Gs on the reverse
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2396 ### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2397 ### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2398 ### thing which would change is the index number for the found alignment. We will continue to assign these alignments to the first indexes 0 and 3, i.e. OT and OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2399
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2400 unless (exists $alignments{$alignment_location}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2401 $alignments{$alignment_location}->{seq_id} = $id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2402 $alignments{$alignment_location}->{alignment_score_1} = $alignment_score_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2403 $alignments{$alignment_location}->{alignment_score_2} = $alignment_score_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2404 $alignments{$alignment_location}->{sum_of_alignment_scores} = $sum_of_alignment_scores_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2405 $alignments{$alignment_location}->{bowtie_sequence_1} = $bowtie_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2406 $alignments{$alignment_location}->{bowtie_sequence_2} = $bowtie_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2407 $alignments{$alignment_location}->{index} = $index;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2408 $alignments{$alignment_location}->{chromosome} = $chromosome_1; # either is fine
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2409 $alignments{$alignment_location}->{position_1} = $position_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2410 $alignments{$alignment_location}->{position_2} = $position_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2411 $alignments{$alignment_location}->{mismatch_info_1} = $MD_tag_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2412 $alignments{$alignment_location}->{mismatch_info_2} = $MD_tag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2413 $alignments{$alignment_location}->{CIGAR_1} = $cigar_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2414 $alignments{$alignment_location}->{CIGAR_2} = $cigar_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2415 $alignments{$alignment_location}->{flag_1} = $flag_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2416 $alignments{$alignment_location}->{flag_2} = $flag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2417 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2418
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2419 # warn "added unique alignment to \%alignments hash\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2420
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2421 # Now reading and storing the next read pair
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2422 my $newline_1 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2423 my $newline_2 = $fhs[$index]->{fh}-> getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2424 if ($newline_1 and $newline_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2425 chomp $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2426 chomp $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2427 # print "$newline_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2428 # print "$newline_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2429 my ($seq_id_1) = split (/\t/,$newline_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2430 my ($seq_id_2) = split (/\t/,$newline_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2431 $seq_id_1 =~ s/\/1$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2432 $seq_id_2 =~ s/\/2$//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2433 # print "New Seq IDs:\t$seq_id_1\t$seq_id_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2434
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2435 $fhs[$index]->{last_seq_id} = $seq_id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2436 $fhs[$index]->{last_line_1} = $newline_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2437 $fhs[$index]->{last_line_2} = $newline_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2438
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2439 if ($seq_id_1 eq $identifier){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2440 die "Sequence with ID $identifier did not have a second best alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2441 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2442 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2443 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2444 # assigning undef to last_seq_id and last_line_1 and _2 and jumping to the next index (end of Bowtie 2 output)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2445 $fhs[$index]->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2446 $fhs[$index]->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2447 $fhs[$index]->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2448 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2449 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2450 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2451 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2452
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2453 ### if the read produced several ambiguous alignments for a single instance of Bowtie 2 we can return already now. If --ambiguous was specified the read sequence will be printed out in FastQ format
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2454 if ($alignment_ambiguous == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2455 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2456 ### report that the sequence pair has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2457 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2458 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2459 # print "$ambiguous_read_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2460 # print "$ambiguous_read_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2461
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2462 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2463 return 2; # => exits to next sequence pair, and prints it out to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2464 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2465 elsif ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2466 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2467 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2468 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2469 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2470 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2471 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2472
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2473 ### if no alignment was found for a certain sequence at all we continue with the next sequence in the sequence file
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2474 unless (%alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2475 $counting{no_single_alignment_found}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2476
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2477 # my $unmapped_read_1 = join("\t",$identifier.'/1','77','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2478 # my $unmapped_read_2 = join("\t",$identifier.'/2','141','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2479 # print "$unmapped_read_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2480 # print "$unmapped_read_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2481 if ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2482 return 1; # => exits to next sequence pair, and prints it out to _unmapped_reads_1.txt and _unmapped_read_2.txt if '--unmapped' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2483 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2484 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2485 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2486 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2487 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2488
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2489 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2490
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2491 ### If the sequence pair was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2492 ### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2493 ### alignment score we are discarding the sequence pair altogether.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2494 ### For end-to-end alignments the maximum alignment score is 0, each mismatch receives a penalty of 6, and each gap receives penalties for opening (5)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2495 ### and extending (3 per bp) the gap.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2496
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2497 #######################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2498
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2499 ### Declaring an empty hash reference which will store all information we need for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2500 my $methylation_call_params; # hash reference
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2501 my $sequence_pair_fails = 0; # using $sequence_pair_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2502
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2503 ### print contents of %alignments for debugging
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2504 ## if (scalar keys %alignments >= 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2505 # print "\n******\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2506 # foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2507 # print "Loc: $alignment_location\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2508 # print "ID: $alignments{$alignment_location}->{seq_id}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2509 # print "AS_1: $alignments{$alignment_location}->{alignment_score_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2510 # print "AS_2: $alignments{$alignment_location}->{alignment_score_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2511 # print "Seq_1: $alignments{$alignment_location}->{bowtie_sequence_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2512 # print "Seq_2: $alignments{$alignment_location}->{bowtie_sequence_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2513 # print "Index $alignments{$alignment_location}->{index}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2514 # print "Chr: $alignments{$alignment_location}->{chromosome}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2515 # print "Pos_1: $alignments{$alignment_location}->{position_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2516 # print "Pos_2: $alignments{$alignment_location}->{position_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2517 # print "CIGAR_1: $alignments{$alignment_location}->{CIGAR_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2518 # print "CIGAR_2: $alignments{$alignment_location}->{CIGAR_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2519 # print "MD_1: $alignments{$alignment_location}->{mismatch_info_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2520 # print "MD_2: $alignments{$alignment_location}->{mismatch_info_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2521 # print "Flag 1: $alignments{$alignment_location}->{flag_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2522 # print "Flag 2: $alignments{$alignment_location}->{flag_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2523 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2524 # print "\n******\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2525 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2526
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2527 ### if there is only 1 entry in the %alignments hash we accept it as the best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2528 if (scalar keys %alignments == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2529 for my $unique_best_alignment (keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2530 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$unique_best_alignment}->{bowtie_sequence_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2531 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$unique_best_alignment}->{bowtie_sequence_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2532 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2533 $methylation_call_params->{$identifier}->{position_1} = $alignments{$unique_best_alignment}->{position_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2534 $methylation_call_params->{$identifier}->{position_2} = $alignments{$unique_best_alignment}->{position_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2535 $methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2536 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$unique_best_alignment}->{alignment_score_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2537 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$unique_best_alignment}->{alignment_score_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2538 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$unique_best_alignment}->{sum_of_alignment_scores};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2539 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$unique_best_alignment}->{mismatch_info_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2540 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$unique_best_alignment}->{mismatch_info_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2541 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$unique_best_alignment}->{CIGAR_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2542 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$unique_best_alignment}->{CIGAR_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2543 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$unique_best_alignment}->{flag_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2544 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$unique_best_alignment}->{flag_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2545 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2546 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2547
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2548 ### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2549 ### we boot the sequence pair altogether)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2550 elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2551 my $best_sum_of_alignment_scores;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2552 my $best_alignment_location;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2553 foreach my $alignment_location (sort {$alignments{$b}->{sum_of_alignment_scores} <=> $alignments{$a}->{sum_of_alignment_scores}} keys %alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2554 # print "$alignments{$alignment_location}->{sum_of_alignment_scores}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2555 unless (defined $best_sum_of_alignment_scores){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2556 $best_sum_of_alignment_scores = $alignments{$alignment_location}->{sum_of_alignment_scores};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2557 $best_alignment_location = $alignment_location;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2558 # print "setting best alignment score to: $best_sum_of_alignment_scores\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2559 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2560 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2561 ### if the second best alignment has the same sum of alignment scores as the first one, the sequence pair will get booted
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2562 if ($alignments{$alignment_location}->{sum_of_alignment_scores} == $best_sum_of_alignment_scores){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2563 # warn "Same sum of alignment scores for 2 different alignments, the sequence pair will get booted!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2564 $sequence_pair_fails = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2565 last; # exiting since we know that the sequence has ambiguous alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2566 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2567 ### else we are going to store the best alignment for further processing
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2568 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2569 $methylation_call_params->{$identifier}->{bowtie_sequence_1} = $alignments{$best_alignment_location}->{bowtie_sequence_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2570 $methylation_call_params->{$identifier}->{bowtie_sequence_2} = $alignments{$best_alignment_location}->{bowtie_sequence_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2571 $methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2572 $methylation_call_params->{$identifier}->{position_1} = $alignments{$best_alignment_location}->{position_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2573 $methylation_call_params->{$identifier}->{position_2} = $alignments{$best_alignment_location}->{position_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2574 $methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2575 $methylation_call_params->{$identifier}->{alignment_score_1} = $alignments{$best_alignment_location}->{alignment_score_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2576 $methylation_call_params->{$identifier}->{alignment_score_2} = $alignments{$best_alignment_location}->{alignment_score_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2577 $methylation_call_params->{$identifier}->{sum_of_alignment_scores} = $alignments{$best_alignment_location}->{sum_of_alignment_scores};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2578 $methylation_call_params->{$identifier}->{mismatch_info_1} = $alignments{$best_alignment_location}->{mismatch_info_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2579 $methylation_call_params->{$identifier}->{mismatch_info_2} = $alignments{$best_alignment_location}->{mismatch_info_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2580 $methylation_call_params->{$identifier}->{CIGAR_1} = $alignments{$best_alignment_location}->{CIGAR_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2581 $methylation_call_params->{$identifier}->{CIGAR_2} = $alignments{$best_alignment_location}->{CIGAR_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2582 $methylation_call_params->{$identifier}->{flag_1} = $alignments{$best_alignment_location}->{flag_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2583 $methylation_call_params->{$identifier}->{flag_2} = $alignments{$best_alignment_location}->{flag_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2584 last; # exiting since the sequence produced a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2585 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2586 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2587 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2588 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2589 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2590 die "There are too many potential hits for this sequence pair (1-4 expected, but found: '",scalar keys %alignments,"')\n";;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2591 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2592
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2593 ### skipping the sequence completely if there were multiple alignments with the same best sum of alignment scores at different positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2594 if ($sequence_pair_fails == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2595 $counting{unsuitable_sequence_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2596
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2597 ### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2598 # my $ambiguous_read_1 = join("\t",$identifier.'/1','256','*','0','0','*','*','0','0',$sequence_1,$quality_value_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2599 # my $ambiguous_read_2 = join("\t",$identifier.'/2','256','*','0','0','*','*','0','0',$sequence_2,$quality_value_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2600 # print "$ambiguous_read_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2601 # print "$ambiguous_read_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2602
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2603 if ($ambiguous){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2604 return 2; # => exits to next sequence pair, and prints it out (in FastQ format) to _ambiguous_reads_1.txt and _ambiguous_reads_2.txt if '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2605 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2606 elsif ($unmapped){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2607 return 1; # => exits to next sequence pair, and prints it out (in FastQ format) to _unmapped_reads_1.txt and _unmapped_reads_2.txt if '--unmapped' but not '--ambiguous' was specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2608 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2609 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2610 return 0; # => exits to next sequence pair (default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2611 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2612 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2613
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2614 ### --DIRECTIONAL
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2615 ### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2616 ### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2617 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2618 if ( ($methylation_call_params->{$identifier}->{index} == 1) or ($methylation_call_params->{$identifier}->{index} == 2) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2619 # warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2620 $counting{alignments_rejected_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2621 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2622 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2623 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2624
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2625 ### If the sequence pair has not been rejected so far it does have a unique best alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2626 $counting{unique_best_alignment_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2627 extract_corresponding_genomic_sequence_paired_ends_bowtie2($identifier,$methylation_call_params);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2628
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2629 ### check to see if the genomic sequences we extracted have the same length as the observed sequences +2, and only then do we perform the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2630 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1}) != length($sequence_1)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2631 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_1}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2632 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2633 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2634 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2635 if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}) != length($sequence_2)+2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2636 warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{start_seq_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2637 $counting{genomic_sequence_could_not_be_extracted_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2638 return 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2639 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2640
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2641 ### now we are set to perform the actual methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2642 $methylation_call_params->{$identifier}->{methylation_call_1} = methylation_call($identifier,$sequence_1,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_1},$methylation_call_params->{$identifier}->{read_conversion_1});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2643 $methylation_call_params->{$identifier}->{methylation_call_2} = methylation_call($identifier,$sequence_2,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2},$methylation_call_params->{$identifier}->{read_conversion_2});
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2644 # print "$methylation_call_params->{$identifier}->{read_conversion_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2645 # print " $sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2646 # print "$methylation_call_params->{$identifier}->{unmodified_genomic_sequence_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2647 # print " $methylation_call_params->{$identifier}->{methylation_call_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2648
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2649 print_bisulfite_mapping_results_paired_ends_bowtie2($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2650 return 0; ## otherwise 1 will be returned by default, which would print the sequence pair to unmapped_1 and _2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2651 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2652
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2653 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2654
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub decide_whether_paired_end_alignment_is_valid{
  ### Decides whether the sequence pair buffered in $fhs[$index] (last_line_1 and last_line_2) is an alignment of
  ### $identifier in a sensible orientation, reading further lines from $fhs[$index]->{fh} where necessary.
  ###
  ### Arguments:
  ###   $index      - position of the alignment filehandle record in @fhs
  ###   $identifier - ID of the sequence pair currently being processed (without the trailing /1 read tag)
  ###
  ### Returns 1 if the pair now stored in last_line_1/last_line_2 belongs to $identifier and passed the orientation
  ### check (either the buffered pair itself, or the pair read directly after it). Returns 0 in all other cases; the
  ### buffer then holds the pair to look at in the next round, or undef values once the end of the output was reached.
  ###
  ### Dies if the orientation check returns something other than 1 or 0, if neither read of a newly read pair ends
  ### on /1, or if the same sequence ID turns up a third time in a row.
  my ($index,$identifier) = @_;

  ### Only the read ID and the strand (fields 0 and 1) of the buffered lines are needed for this decision
  my ($id_1,$strand_1) = (split (/\t/,$fhs[$index]->{last_line_1},-1))[0,1];
  my ($id_2,$strand_2) = (split (/\t/,$fhs[$index]->{last_line_2},-1))[0,1];

  my $seq_id_1 = $id_1;
  my $seq_id_2 = $id_2;
  $seq_id_1 =~ s/\/1$//; # removing the read /1 tag
  $seq_id_2 =~ s/\/1$//; # removing the read /1 tag

  ### the sequence pair stored in @fhs as last_line_1 and last_line_2 is already the next sequence pair to be
  ### analysed -> analyse next round
  if ($seq_id_1 ne $identifier and $seq_id_2 ne $identifier){
    return 0;
  }

  ### checking the orientation of the alignment (the original IDs including their read tags are passed on)
  my $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    return 1; ### 1st possibility for A SEQUENCE-PAIR TO PASS
  }
  if ($orientation != 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### Overwrites the look-ahead buffer of this filehandle; called with three undefs at the end of the output
  my $store_pair = sub {
    my ($pair_id,$line_1,$line_2) = @_;
    $fhs[$index]->{last_seq_id} = $pair_id;
    $fhs[$index]->{last_line_1} = $line_1;
    $fhs[$index]->{last_line_2} = $line_2;
  };

  ### The buffered alignment was in the wrong orientation, so we need to read in two new lines.
  ### getline() returns undef at the end of the file, so this is tested with defined rather than for truth
  my $newline_1 = $fhs[$index]->{fh}->getline();
  my $newline_2 = $fhs[$index]->{fh}->getline();
  if (!defined $newline_1 or !defined $newline_2){
    $store_pair->(undef,undef,undef); # end of bowtie output
    return 0; # not processing anything as the buffered alignment was in the wrong orientation
  }

  ### extract the ID and strand again (from $newline_1 and $newline_2 this time)
  ($id_1,$strand_1) = (split (/\t/,$newline_1))[0,1];
  ($id_2,$strand_2) = (split (/\t/,$newline_2))[0,1];

  ### the ID of the pair is taken from whichever read ends on /1
  my $seqid;
  $seq_id_1 = $id_1;
  $seq_id_2 = $id_2;
  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the sequence pair we just read in is already the next sequence pair to be analysed -> store it in @fhs
  if ($seq_id_1 ne $identifier and $seq_id_2 ne $identifier){
    $store_pair->($seqid,$newline_1,$newline_2);
    return 0; # processing the new alignment result only in the next round
  }

  ### the next entry is still the same sequence -> checking orientation again
  $orientation = ensure_sensical_alignment_orientation_paired_ends ($index,$id_1,$strand_1,$id_2,$strand_2);
  if ($orientation == 1){
    $store_pair->($seqid,$newline_1,$newline_2);
    return 1; ### 2nd possibility for a SEQUENCE-PAIR TO PASS
  }
  if ($orientation != 0){
    die "The orientation of the alignment must be either correct or incorrect\n";
  }

  ### The alignment was in the wrong orientation again, so yet another 2 lines are read in and stored in @fhs
  ### (this must be the next entry)
  $newline_1 = $fhs[$index]->{fh}->getline();
  $newline_2 = $fhs[$index]->{fh}->getline();
  if (!defined $newline_1 or !defined $newline_2){
    $store_pair->(undef,undef,undef); # end of bowtie output
    return 0; # not processing anything as both alignments of this sequence pair were in the wrong orientation
  }

  ($seq_id_1) = split (/\t/,$newline_1);
  ($seq_id_2) = split (/\t/,$newline_2);
  if ($seq_id_1 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_1;
  }
  elsif ($seq_id_2 =~ s/\/1$//){ # removing the read /1 tag
    $seqid = $seq_id_2;
  }
  else{
    die "One of the two reads needs to end on /1!!";
  }

  ### the next 2 lines must not have the same seq ID again; otherwise they become the buffered entry for the next round
  die "Same seq ID 3 or more times in a row!(should be 2 max)" if ($seqid eq $identifier);
  $store_pair->($seqid,$newline_1,$newline_2);
  return 0; # not processing anything this round as the alignments of this sequence pair were in the wrong orientation
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2775
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2776 ### EXTRACT GENOMIC SEQUENCE | BOWTIE 1 | PAIRED-END
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2777
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2778 sub extract_corresponding_genomic_sequence_paired_ends {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2779 my ($sequence_identifier,$methylation_call_params) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2780 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2781 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2782 my $alignment_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2783 my $alignment_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2784 my $read_conversion_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2785 my $read_conversion_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2786 my $genome_conversion;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2787
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2788 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2789 ### if the C happens to be at the first or last position of the actually observed sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2790 my $non_bisulfite_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2791 my $non_bisulfite_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2792
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2793 ### all alignments reported by bowtie have the + alignment first and the - alignment as the second one irrespective of whether read 1 or read 2 was
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2794 ### the + alignment. We however always read in sequences read 1 then read 2, so if read 2 is the + alignment we need to swap the extracted genomic
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2795 ### sequences around!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2796 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2797 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2798 ### [Index 0, sequence originated from (converted) forward strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2799 $counting{CT_GA_CT_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2800 $alignment_read_1 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2801 $alignment_read_2 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2802 $read_conversion_info_1 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2803 $read_conversion_info_2 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2804 $genome_conversion = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2805 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2806 ### for hits on the forward strand we need to capture 2 extra bases at the 3' end
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2807
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2808 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1},length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ##CHH change
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2809
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2810 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2811 ### As the second conversion is GA we need to capture 1 base 3', so that it is a 5' base after reverse complementation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2812 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > $methylation_call_params->{$sequence_identifier}->{start_seq_2}+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+1){ ## CHH change to +1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2813
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2814 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2815 ### the reverse strand sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2816 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2817 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2818 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2819 $non_bisulfite_sequence_2 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2820 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2821 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2822
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2823 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2824 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2825 ### [Index 1, sequence originated from complementary to (converted) reverse strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2826 $counting{GA_CT_GA_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2827 $alignment_read_1 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2828 $alignment_read_2 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2829 $read_conversion_info_1 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2830 $read_conversion_info_2 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2831 $genome_conversion = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2832
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2833 ### SEQUENCE 1 (this is always the forward hit, in this case it is read 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2834 ### as we need to make the methylation call for the base 5' of the first base (GA conversion!) we need to capture 2 extra bases at the 5' end
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2835 if ($methylation_call_params->{$sequence_identifier}->{start_seq_1}-1 > 0){ ## CHH change to -1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2836 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$methylation_call_params->{$sequence_identifier}->{start_seq_1}-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH change to -2/+2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2837 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2838 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2839 $non_bisulfite_sequence_1 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2840 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2841
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2842 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is read 2)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2843 ### As we are doing a CT comparison for the reverse strand we are taking 2 bases extra at the 5' end, so it is a 3' base after reverse complementation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2844 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to -2/+2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2845 ### the reverse strand sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2846 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2847 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2848
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2849 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2850 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2851 ### [Index 2, sequence originated from the complementary to (converted) forward strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2852 $counting{GA_CT_CT_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2853 $alignment_read_1 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2854 $alignment_read_2 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2855 $read_conversion_info_1 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2856 $read_conversion_info_2 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2857 $genome_conversion = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2858
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2859 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2860 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2861 ### As read 1 is GA converted we need to capture 2 extra 3' bases which will be 2 extra 5' base after reverse complementation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2862 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH change to +2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2863 ### the reverse strand sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2864 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2865
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2866 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2867 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2868 ### Read 2 is CT converted so we need to capture 2 extra 3' bases
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2869 if (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) > ($methylation_call_params->{$sequence_identifier}->{start_seq_1})+length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+1){ ## CHH change to +1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2870 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1}),length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ## CHH changed from +1 to +2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2871 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2872 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2873 $non_bisulfite_sequence_2 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2874 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2875 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2876
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2877 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2878 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2879 ### [Index 3, sequence originated from the (converted) reverse strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2880 $counting{CT_GA_GA_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2881 $alignment_read_1 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2882 $alignment_read_2 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2883 $read_conversion_info_1 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2884 $read_conversion_info_2 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2885 $genome_conversion = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2886
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2887 ### Here we switch the sequence information round!! non_bisulfite_sequence_1 will later correspond to the read 1!!!!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2888 ### SEQUENCE 1 (this is always the forward hit, in this case it is READ 2), read 1 is in - orientation on the reverse strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2889 ### As read 1 is CT converted we need to capture 2 extra 5' bases which will be 2 extra 3' bases after reverse complementation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2890 if ( ($methylation_call_params->{$sequence_identifier}->{start_seq_2}-1) > 0){ ## CHH changed to -1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2891 $non_bisulfite_sequence_1 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_2})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_2})+2); ### CHH changed to -2/+2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2892 ### the reverse strand sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2893 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2894 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2895 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2896 $non_bisulfite_sequence_1 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2897 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2898
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2899 ### SEQUENCE 2 (this will always be on the reverse strand, in this case it is READ 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2900 ### non_bisulfite_sequence_2 will later correspond to the read 2!!!!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2901 ### Read 2 is GA converted so we need to capture 2 extra 5' bases
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2902 $non_bisulfite_sequence_2 = substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},($methylation_call_params->{$sequence_identifier}->{start_seq_1})-2,length($methylation_call_params->{$sequence_identifier}->{bowtie_sequence_1})+2); ### CHH changed to -2/+2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2903 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2904 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2905 die "Too many bowtie result filehandles\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2906 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2907 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2908 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2909
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2910 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2911 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2912 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2913 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2914 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2915 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2916 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2917 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2918
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2919 ### EXTRACT GENOMIC SEQUENCE BOWTIE 2 | PAIRED-END
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2920
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2921 sub extract_corresponding_genomic_sequence_paired_ends_bowtie2{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2922 my ($sequence_identifier,$methylation_call_params) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2923 ### A bisulfite sequence pair for 1 location in the genome can theoretically be on any of the 4 possible converted strands. We are also giving the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2924 ### sequence a 'memory' of the conversion we are expecting which we will need later for the methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2925
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2926 my $cigar_1 = $methylation_call_params->{$sequence_identifier}->{CIGAR_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2927 my $cigar_2 = $methylation_call_params->{$sequence_identifier}->{CIGAR_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2928 my $flag_1 = $methylation_call_params->{$sequence_identifier}->{flag_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2929 my $flag_2 = $methylation_call_params->{$sequence_identifier}->{flag_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2930 # print "$cigar_1\t$cigar_2\t$flag_1\t$flag_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2931 ### We are now extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can also make a CpG methylation call and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2932 ### in addition make differential calls for Cs in CHG or CHH context if the C happens to be at the last (or first) position of the actually observed sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2933
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2934 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2935 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2936 my $alignment_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2937 my $alignment_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2938 my $read_conversion_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2939 my $read_conversion_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2940 my $genome_conversion;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2941
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2942 ### Now extracting the same sequence from the mouse genomic sequence, +2 extra bases at one of the ends so that we can also make a CpG, CHG or CHH methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2943 ### if the C happens to be at the last position of the actually observed sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2944 my $non_bisulfite_sequence_1 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2945 my $non_bisulfite_sequence_2 = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2946
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2947 ### Positions in SAM format are 1-based, so we need to subtract 1 when getting substrings
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2948 my $pos_1 = $methylation_call_params->{$sequence_identifier}->{position_1}-1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2949 my $pos_2 = $methylation_call_params->{$sequence_identifier}->{position_2}-1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2950
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2951 # parsing CIGAR 1 string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2952 my @len_1 = split (/\D+/,$cigar_1); # storing the length per operation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2953 my @ops_1 = split (/\d+/,$cigar_1); # storing the operation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2954 shift @ops_1; # remove the empty first element
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2955 die "CIGAR 1 string contained a non-matching number of lengths and operations\n" unless (scalar @len_1 == scalar @ops_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2956 # parsing CIGAR 2 string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2957 my @len_2 = split (/\D+/,$cigar_2); # storing the length per operation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2958 my @ops_2 = split (/\d+/,$cigar_2); # storing the operation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2959 shift @ops_2; # remove the empty first element
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2960 die "CIGAR 2 string contained a non-matching number of lengths and operations\n" unless (scalar @len_2 == scalar @ops_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2961
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2962 my $indels_1 = 0; # addiong these to the hemming distance value (needed for the NM field in the final SAM output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2963 my $indels_2 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2964
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2965 ### Extracting read 1 genomic sequence ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2966
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2967 # extracting 2 additional bp at the 5' end (read 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2968 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2969 # checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2970 unless ( ($pos_1-2) > 0){# exiting with en empty genomic sequence otherwise
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2971 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2972 return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2973 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2974 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1-2,2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2975 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2976
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2977 foreach (0..$#len_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2978 if ($ops_1[$_] eq 'M'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2979 # extracting genomic sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2980 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,$len_1[$_]);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2981 # warn "$non_bisulfite_sequence_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2982 # adjusting position
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2983 $pos_1 += $len_1[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2984 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2985 elsif ($ops_1[$_] eq 'I'){ # insertion in the read sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2986 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2987 $non_bisulfite_sequence_1 .= 'N' x $len_1[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2988 # warn "$non_bisulfite_sequence_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2989 # position doesn't need adjusting
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2990 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2991 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2992 elsif ($ops_1[$_] eq 'D'){ # deletion in the read sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2993 # we do not add any genomic sequence but only adjust the position
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2994 # warn "Just adjusting the position by: ",$len_1[$_],"bp\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2995 $pos_1 += $len_1[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2996 $indels_1 += $len_1[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2997 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2998 elsif($cigar_1 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
2999 die "The CIGAR 1 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3000 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3001 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3002 die "The CIGAR 1 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_1\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3003 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3004 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3005
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3006 ### 3' end of read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3007 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3008 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3009 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_1+2){# exiting with en empty genomic sequence otherwise
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3010 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3011 return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3012 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3013 $non_bisulfite_sequence_1 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_1,2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3014 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3015
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3016
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3017 ### Extracting read 2 genomic sequence ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3018
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3019 ### 5' end of read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3020 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 1) or ($methylation_call_params->{$sequence_identifier}->{index} == 3) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3021 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3022 unless ( ($pos_2-2) >= 0){# exiting with en empty genomic sequence otherwise
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3023 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3024 return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3025 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3026 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2-2,2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3027 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3028
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3029 foreach (0..$#len_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3030 if ($ops_2[$_] eq 'M'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3031 # extracting genomic sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3032 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,$len_2[$_]);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3033 # warn "$non_bisulfite_sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3034 # adjusting position
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3035 $pos_2 += $len_2[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3036 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3037 elsif ($ops_2[$_] eq 'I'){ # insertion in the read sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3038 # we simply add padding Ns instead of finding genomic sequence. This will not be used to infer methylation calls
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3039 $non_bisulfite_sequence_2 .= 'N' x $len_2[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3040 # warn "$non_bisulfite_sequence_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3041 # position doesn't need adjusting
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3042 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3043 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3044 elsif ($ops_2[$_] eq 'D'){ # deletion in the read sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3045 # we do not add any genomic sequence but only adjust the position
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3046 # warn "Just adjusting the position by: ",$len_2[$_],"bp\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3047 $pos_2 += $len_2[$_];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3048 $indels_2 += $len_2[$_]; # adding to $indels_1 to determine the hemming distance (= single base mismatches, insertions or deletions) for the SAM output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3049 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3050 elsif($cigar_2 =~ tr/[NSHPX=]//){ # if these (for standard mapping) illegal characters exist we die
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3051 die "The CIGAR 2 string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3052 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3053 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3054 die "The CIGAR 2 string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3055 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3056 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3057
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3058 ### 3' end of read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3059 if ( ($methylation_call_params->{$sequence_identifier}->{index} == 0) or ($methylation_call_params->{$sequence_identifier}->{index} == 2) ){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3060 ## checking if the substring will be valid or if we can't extract the sequence because we are right at the edge of a chromosome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3061 unless (length($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}}) >= $pos_2+2){# exiting with en empty genomic sequence otherwise
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3062 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3063 return;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3064 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3065 $non_bisulfite_sequence_2 .= substr ($chromosomes{$methylation_call_params->{$sequence_identifier}->{chromosome}},$pos_2,2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3066 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3067
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3068 ### all paired-end alignments reported by Bowtie 2 have the Read 1 alignment first and the Read 2 alignment as the second one irrespective of whether read 1 or read 2 was
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3069 ### the + alignment. We also read in sequences read 1 then read 2 so they should correspond perfectly
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3070
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3071 ### results from CT converted read 1 plus GA converted read 2 vs. CT converted genome (+/- orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3072 if ($methylation_call_params->{$sequence_identifier}->{index} == 0){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3073 ### [Index 0, sequence originated from (converted) forward strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3074 $counting{CT_GA_CT_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3075 $alignment_read_1 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3076 $alignment_read_2 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3077 $read_conversion_info_1 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3078 $read_conversion_info_2 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3079 $genome_conversion = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3080 ### Read 1 is always the forward hit
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3081 ### Read 2 will always be on the reverse strand, so it needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3082 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3083 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3084
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3085 ### results from GA converted read 1 plus CT converted read 2 vs. GA converted genome (+/- orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3086 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3087 ### [Index 1, sequence originated from complementary to (converted) bottom strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3088 $counting{GA_CT_GA_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3089 $alignment_read_1 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3090 $alignment_read_2 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3091 $read_conversion_info_1 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3092 $read_conversion_info_2 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3093 $genome_conversion = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3094 ### Read 1 is always the forward hit
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3095 ### Read 2 will always be on the reverse strand, so it needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3096 $non_bisulfite_sequence_2 = reverse_complement($non_bisulfite_sequence_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3097 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3098
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3099 ### results from GA converted read 1 plus CT converted read 2 vs. CT converted genome (-/+ orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3100 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3101 ### [Index 2, sequence originated from the complementary to (converted) top strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3102 $counting{GA_CT_CT_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3103 $alignment_read_1 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3104 $alignment_read_2 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3105 $read_conversion_info_1 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3106 $read_conversion_info_2 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3107 $genome_conversion = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3108
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3109 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3110 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3111 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3112
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3113 ### results from CT converted read 1 plus GA converted read 2 vs. GA converted genome (-/+ orientation alignments are reported only)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3114 elsif ($methylation_call_params->{$sequence_identifier}->{index} == 3){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3115 ### [Index 3, sequence originated from the (converted) reverse strand]
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3116 $counting{CT_GA_GA_count}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3117 $alignment_read_1 = '-';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3118 $alignment_read_2 = '+';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3119 $read_conversion_info_1 = 'CT';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3120 $read_conversion_info_2 = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3121 $genome_conversion = 'GA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3122 ### Read 1 (the reverse strand) genomic sequence needs to be reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3123 $non_bisulfite_sequence_1 = reverse_complement($non_bisulfite_sequence_1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3124 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3125 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3126 die "Too many bowtie result filehandles\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3127 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3128 ### the alignment_strand information is needed to determine which strand of the genomic sequence we are comparing the read against,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3129 ### the read_conversion information is needed to know whether we are looking for C->T or G->A substitutions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3130
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3131 $methylation_call_params->{$sequence_identifier}->{alignment_read_1} = $alignment_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3132 $methylation_call_params->{$sequence_identifier}->{alignment_read_2} = $alignment_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3133 $methylation_call_params->{$sequence_identifier}->{genome_conversion} = $genome_conversion;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3134 $methylation_call_params->{$sequence_identifier}->{read_conversion_1} = $read_conversion_info_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3135 $methylation_call_params->{$sequence_identifier}->{read_conversion_2} = $read_conversion_info_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3136 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_1} = $non_bisulfite_sequence_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3137 $methylation_call_params->{$sequence_identifier}->{unmodified_genomic_sequence_2} = $non_bisulfite_sequence_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3138 ## the end position of a read is stored in $pos
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3139 $methylation_call_params->{$sequence_identifier}->{end_position_1} = $pos_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3140 $methylation_call_params->{$sequence_identifier}->{end_position_2} = $pos_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3141 $methylation_call_params->{$sequence_identifier}->{indels_1} = $indels_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3142 $methylation_call_params->{$sequence_identifier}->{indels_2} = $indels_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3143 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3144
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3145 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3146 ### PRINT SINGLE END RESULTS: Bowtie 1 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3147 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3148
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end{
  ### Writes one aligned single-end read (Bowtie 1 mode) and its methylation call to the output.
  ###   $identifier              - read ID, used as the key into $methylation_call_params
  ###   $sequence                - the read sequence as it will be printed
  ###   $methylation_call_params - hashref of per-read alignment details; its {position} entry is incremented here
  ###   $quality_value           - quality string of the read
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### Qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### All alignment details of this read live in one hash (taking the reference this way vivifies a missing entry,
  ### exactly like a direct nested hash access would)
  my $read = \%{ $methylation_call_params->{$identifier} };

  ### Bowtie 1 reports the index and not the bp position, so the start position of the read is shifted by +1 bp
  $read->{position} += 1;

  unless ($vanilla){
    ### SAM output, default since Bismark v1.0.0 (the sub lives at the end of the script)
    single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
    return;
  }

  ### Custom tab-delimited Bismark format: every uniquely mapped read and its methylation call on one line
  my @fields = (
                $identifier,
                @{$read}{qw(alignment_strand chromosome position end_position)},
                $sequence,
                @{$read}{qw(unmodified_genomic_sequence methylation_call read_conversion genome_conversion)},
                $quality_value,
               );
  my $bowtie1_output = join("\t",@fields);
  print OUT "$bowtie1_output\n";
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3172
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3173 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3174 ### PRINT SINGLE END RESULTS: Bowtie 2 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3175 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3176
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub print_bisulfite_mapping_result_single_end_bowtie2{
  ### Hands one aligned single-end read (Bowtie 2 mode) and its methylation call over to the SAM writer.
  ###   $identifier              - read ID, used as the key into $methylation_call_params
  ###   $sequence                - the read sequence as it will be printed
  ###   $methylation_call_params - hashref of per-read alignment details (passed through untouched)
  ###   $quality_value           - quality string of the read
  my ($identifier,$sequence,$methylation_call_params,$quality_value)= @_;

  ### Qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  $quality_value = $phred64 ? convert_phred64_quals_to_phred33($quality_value)
                 : $solexa  ? convert_solexa_quals_to_phred33($quality_value)
                 :            $quality_value;

  ### Only mapped reads get here (unmapped and ambiguous reads were already printed); the SAM writer is at the end of the script
  single_end_SAM_output($identifier,$sequence,$methylation_call_params,$quality_value);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3191
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3192 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3193 ### PRINT PAIRED END RESULTS: Bowtie 1 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3194 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3195
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends{
  ### Writes one aligned read pair (Bowtie 1 mode) and its methylation calls to the output.
  ###   $identifier                        - read pair ID, used as the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - the sequences of read 1 and read 2 as they will be printed
  ###   $methylation_call_params           - hashref of per-pair alignment details; its {start_seq_1} entry is incremented here
  ###   $quality_value_1, $quality_value_2 - quality strings of read 1 and read 2
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### Qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # the loop variable aliases the two quality strings
      $quality = $to_phred33->($quality);
    }
  }

  ### All alignment details of this pair live in one hash (taking the reference this way vivifies a missing entry,
  ### exactly like a direct nested hash access would)
  my $pair = \%{ $methylation_call_params->{$identifier} };

  ### Bowtie 1 reports the index and not the bp position, so the start position is shifted by +1 bp
  ### (the end position is already 1-based)
  $pair->{start_seq_1} += 1;

  unless ($vanilla){
    ### SAM output, default since Bismark v1.0.0 (the sub lives at the end of the script)
    paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
    return;
  }

  ### Custom tab-delimited Bismark format: every aligned read pair and its methylation calls on one line
  my @fields = (
                $identifier,
                @{$pair}{qw(alignment_read_1 chromosome start_seq_1 alignment_end)},
                $sequence_1,
                @{$pair}{qw(unmodified_genomic_sequence_1 methylation_call_1)},
                $sequence_2,
                @{$pair}{qw(unmodified_genomic_sequence_2 methylation_call_2)},
                @{$pair}{qw(read_conversion_1 genome_conversion)},
                $quality_value_1,
                $quality_value_2,
               );
  my $bowtie1_output_paired_end = join("\t",@fields);
  print OUT "$bowtie1_output_paired_end\n";
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3222
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3223 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3224 ### PRINT PAIRED END RESULTS: Bowtie 2 ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3225 ##########################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3226
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub print_bisulfite_mapping_results_paired_ends_bowtie2{
  ### Hands one aligned read pair (Bowtie 2 mode) and its methylation calls over to the SAM writer.
  ###   $identifier                        - read pair ID, used as the key into $methylation_call_params
  ###   $sequence_1, $sequence_2           - the sequences of read 1 and read 2 as they will be printed
  ###   $methylation_call_params           - hashref of per-pair alignment details (passed through untouched)
  ###   $quality_value_1, $quality_value_2 - quality strings of read 1 and read 2
  my ($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2)= @_;

  ### Qualities are always reported in Sanger encoding (Phred 33 scale); $phred64 takes precedence over $solexa
  if ($phred64 or $solexa){
    my $to_phred33 = $phred64 ? \&convert_phred64_quals_to_phred33 : \&convert_solexa_quals_to_phred33;
    foreach my $quality ($quality_value_1,$quality_value_2){ # the loop variable aliases the two quality strings
      $quality = $to_phred33->($quality);
    }
  }

  ### Only aligned pairs get here (unmapped and ambiguous reads were already printed); the SAM writer is at the end of the script
  paired_end_SAM_output($identifier,$sequence_1,$sequence_2,$methylation_call_params,$quality_value_1,$quality_value_2);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3244
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3245
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub convert_phred64_quals_to_phred33{
  ### Re-encodes a whole Phred+64 quality string as Phred+33. Every character is first turned into its
  ### Phred score and that score is then turned back into a Phred+33 character.
  ### Takes the quality string and returns the converted string, which has the same number of characters.
  my ($phred64_quality) = @_;

  my @phred33_chars = map {
    my $phred_score = convert_phred64_quality_string_into_phred_score($_);
    convert_phred_score_into_phred33_quality_string($phred_score);
  } split (//,$phred64_quality);

  return join ("",@phred33_chars);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3261
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub convert_solexa_quals_to_phred33{
  ### Re-encodes a whole Solexa (pre-1.3) quality string as Phred+33. Every character is first turned into a
  ### Phred score and that score is then turned back into a Phred+33 character.
  ### Takes the quality string and returns the converted string, which has the same number of characters.
  my ($solexa_quality) = @_;

  my @phred33_chars = map {
    my $phred_score = convert_solexa_pre1_3_quality_string_into_phred_score($_);
    convert_phred_score_into_phred33_quality_string($phred_score);
  } split (//,$solexa_quality);

  return join ("",@phred33_chars);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3277
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub convert_phred_score_into_phred33_quality_string{
  ### Turns a numeric Phred score into its single-character Phred+33 representation (character code = score + 33)
  my ($phred_score) = @_;
  return chr(33 + $phred_score);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3283
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub convert_phred64_quality_string_into_phred_score{
  ### Turns a single Phred+64 quality character into its numeric Phred score (score = character code - 64)
  my ($quality_char) = @_;
  return ord($quality_char) - 64;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3289
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub convert_solexa_pre1_3_quality_string_into_phred_score{
  ### Turns a single Solexa (pre-1.3) quality character into a numeric score (score = character code - 59).
  ### A plain offset of 59 is used here on purpose: all Phred Scores between 10 and 40 look exactly the same,
  ### there is only a minute difference for values between 0 and 10
  my ($quality_char) = @_;
  return ord($quality_char) - 59;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3296
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3297
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub extract_corresponding_genomic_sequence_single_end {
  ### Works out, for one aligned single-end read, which of the 4 possible bisulfite strands it came from and
  ### fetches the matching stretch of unconverted genomic sequence (read length + 2 extra bases).
  ###   $sequence_identifier     - read ID, used as the key into $methylation_call_params
  ###   $methylation_call_params - hashref of per-read details. The entries {index}, {chromosome}, {position} and
  ###                              {bowtie_sequence} are read here; {alignment_strand}, {read_conversion},
  ###                              {genome_conversion}, {unmodified_genomic_sequence} and {end_position} are written.
  ### Also increments the matching strand counter in %counting. Dies if {index} is not 0, 1, 2 or 3.
  my ($sequence_identifier,$methylation_call_params) = @_;

  ### All details of this read live in one hash (taking the reference this way vivifies a missing entry,
  ### exactly like a direct nested hash access would)
  my $read  = \%{ $methylation_call_params->{$sequence_identifier} };
  my $index = $read->{index};

  ### A bisulfite sequence for 1 location in the genome can theoretically be any of the 4 possible converted strands.
  ### The alignment strand tells us which strand of the genomic sequence the read is compared against, the read
  ### conversion tells us whether we are looking for C->T or G->A substitutions later in the methylation call.
  my ($counter,$alignment_strand,$read_conversion_info,$genome_conversion);
  my $extra_bases_upstream; # true: the 2 extra bases are taken in front of the read start, false: behind the read end
  my $needs_revcomp;        # true: the extracted forward strand sequence has to be reverse complemented

  if ($index == 0){
    ### CT converted read vs. CT converted genome: sequence originated from the (converted) forward strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_CT_count','+','CT','CT');
    ($extra_bases_upstream,$needs_revcomp) = (0,0);
  }
  elsif ($index == 1){
    ### CT converted read vs. GA converted genome: sequence originated from the (converted) reverse strand.
    ### The 2 extra 5' bases on the forward strand become 2 extra 3' bases after reverse complementation
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('CT_GA_count','-','CT','GA');
    ($extra_bases_upstream,$needs_revcomp) = (1,1);
  }
  elsif ($index == 2){
    ### GA converted read vs. CT converted genome: sequence originated from complementary to (converted) forward strand.
    ### The 2 extra 3' bases on the forward strand become 2 extra 5' bases after reverse complementation
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_CT_count','-','GA','CT');
    ($extra_bases_upstream,$needs_revcomp) = (0,1);
  }
  elsif ($index == 3){
    ### GA converted read vs. GA converted genome: sequence originated from complementary to (converted) reverse strand.
    ### 2 extra bases at the 5' end as we are nominally checking the converted reverse strand
    ($counter,$alignment_strand,$read_conversion_info,$genome_conversion) = ('GA_GA_count','+','GA','GA');
    ($extra_bases_upstream,$needs_revcomp) = (1,0);
  }
  else{
    die "Too many bowtie result filehandles\n";
  }
  $counting{$counter}++;

  ### The 2 extra bases allow a CpG call as well as differential CHG/CHH calls for a C at the last position of
  ### the actually observed sequence
  my $position     = $read->{position};
  my $read_length  = length($read->{bowtie_sequence});
  my $window_start = $extra_bases_upstream ? $position-2 : $position;

  ### checking if the substring will be valid or if we can't extract the sequence because we are right at the
  ### edge of a chromosome (%chromosomes is always indexed directly so that the chromosome string is never copied)
  my $window_fits = $extra_bases_upstream
                  ? $window_start >= 0
                  : length($chromosomes{$read->{chromosome}}) > $position+$read_length+1;

  ### reads too close to a chromosome edge keep an empty genomic sequence
  my $non_bisulfite_sequence = '';
  if ($window_fits){
    $non_bisulfite_sequence = substr ($chromosomes{$read->{chromosome}},$window_start,$read_length+2);
    $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence) if $needs_revcomp;
  }

  @{$read}{qw(alignment_strand read_conversion genome_conversion unmodified_genomic_sequence)}
    = ($alignment_strand,$read_conversion_info,$genome_conversion,$non_bisulfite_sequence);

  ### at this point we can also determine the end position of a read
  $read->{end_position} = $position+$read_length;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3401
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3402
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Extracts the genomic sequence corresponding to a single-end Bowtie 2 alignment and stores it,
### together with strand/conversion information, in $methylation_call_params->{$sequence_identifier}.
###
### Arguments:
###   $sequence_identifier     - key of the read within $methylation_call_params
###   $methylation_call_params - hash reference; the entry for the read must provide the keys
###                              'CIGAR', 'chromosome', 'position' (1-based, SAM style) and 'index' (0-3)
###
### Reads the file-level hash %chromosomes (chromosome name => sequence) and increments one of the
### alignment counters in the file-level hash %counting.
###
### On success the following keys are written for the read:
###   alignment_strand, read_conversion, genome_conversion, unmodified_genomic_sequence,
###   end_position and indels (summed length of all insertions and deletions in the CIGAR string)
###
### If the 2 extra context bases cannot be extracted because the read sits right at the edge of a
### chromosome, only 'unmodified_genomic_sequence' is written and the sub returns early:
###   - index 1/3 (extra bases at the 5' end): the stored sequence is empty
###   - index 0/2 (extra bases at the 3' end): the stored sequence is the genomic sequence of the read
###     WITHOUT the 2 extra bases (it is not empty)
###     NOTE(review): presumably the caller recognises this case by the sequence length - confirm against caller
###
### Dies if the CIGAR string is malformed, contains operations other than M, I and D, or if 'index'
### is not one of 0, 1, 2 or 3.
sub extract_corresponding_genomic_sequence_single_end_bowtie2{
    my ($sequence_identifier,$methylation_call_params) = @_;

    ### all per-read values live in this hash; '||= {}' creates the entry if it is missing, exactly as
    ### direct nested access ($methylation_call_params->{...}->{...}) would do
    my $read_params = ($methylation_call_params->{$sequence_identifier} ||= {});

    my $cigar      = $read_params->{CIGAR};
    my $chromosome = $read_params->{chromosome};
    my $index      = $read_params->{index};

    ### We are extracting the corresponding genomic sequence, +2 extra bases at the end (or start) so that we can
    ### also make a CpG methylation call and in addition make differential calls for Cs in CHG or CHH context if
    ### the C happens to be at the last (or first) position of the actually observed sequence
    my $non_bisulfite_sequence = '';

    ### Positions in SAM format are 1 based, so we need to subtract 1 when getting substrings
    my $pos = $read_params->{position} - 1;

    ### parsing the CIGAR string into parallel lists of lengths and operations
    my @len = split (/\D+/,$cigar); # the length per operation
    my @ops = split (/\d+/,$cigar); # the operations
    shift @ops;                     # the CIGAR string starts with a number, so the first element is empty
    die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

    ### If the sequence aligns best as CT converted reads vs. GA converted genome (OB, index 1) or GA converted
    ### reads vs. GA converted genome (CTOB, index 3) the 2 extra bases are taken from the 5' end
    if ( ($index == 1) or ($index == 3) ){
        ## we can't extract the extra bases if we are right at the start of a chromosome
        unless ( ($pos-2) >= 0 ){ # exiting with an empty genomic sequence
            $read_params->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
            return;
        }
        $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos-2,2);
    }

    ### number of inserted plus deleted bases; stored for the read under the key 'indels'
    my $indels = 0;

    foreach my $op_number (0..$#len){
        my $operation = $ops[$op_number];
        my $length    = $len[$op_number];

        if ($operation eq 'M'){
            # extracting the genomic sequence and moving along the chromosome
            $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,$length);
            $pos += $length;
        }
        elsif ($operation eq 'I'){ # insertion in the read sequence
            # we simply add padding Ns instead of genomic sequence; the genomic position does not change
            $non_bisulfite_sequence .= 'N' x $length;
            $indels += $length;
        }
        elsif ($operation eq 'D'){ # deletion in the read sequence
            # we do not add any genomic sequence but only adjust the position
            $pos += $length;
            $indels += $length;
        }
        elsif ($operation =~ tr/NSHPX=//){ # valid SAM operations which are illegal for standard mapping
            # tr/// is not a regex: the characters are listed without [] so that brackets are not counted as well
            die "The CIGAR string contained illegal CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
        else{
            die "The CIGAR string contained undefined CIGAR operations in addition to 'M', 'I' and 'D': $cigar\n";
        }
    }

    ### If the sequence aligns best as CT converted reads vs. CT converted genome (OT, index 0) or GA converted
    ### reads vs. CT converted genome (CTOT, index 2) the 2 extra bases are taken from the 3' end
    if ( ($index == 0) or ($index == 2) ){
        ## we can't extract the extra bases if we are right at the end of a chromosome
        unless (length($chromosomes{$chromosome}) >= $pos+2){
            # exiting with the sequence extracted so far, i.e. without the 2 extra bases
            $read_params->{unmodified_genomic_sequence} = $non_bisulfite_sequence;
            return;
        }
        $non_bisulfite_sequence .= substr ($chromosomes{$chromosome},$pos,2);
    }

    ### the alignment_strand information is needed to determine which strand of the genomic sequence we are
    ### comparing the read against, the read_conversion information is needed to know whether we are looking
    ### for C->T or G->A substitutions
    my $alignment_strand;
    my $read_conversion_info;
    my $genome_conversion;

    if ($index == 0){
        ### CT converted read vs. CT converted genome [sequence originated from (converted) forward strand]
        $counting{CT_CT_count}++;
        $alignment_strand     = '+';
        $read_conversion_info = 'CT';
        $genome_conversion    = 'CT';
    }
    elsif ($index == 1){
        ### CT converted read vs. GA converted genome [sequence originated from (converted) reverse strand]
        $counting{CT_GA_count}++;
        $alignment_strand     = '-';
        $read_conversion_info = 'CT';
        $genome_conversion    = 'GA';

        ### reverse complement!
        $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
    elsif ($index == 2){
        ### GA converted read vs. CT converted genome [sequence originated from complementary to (converted) forward strand]
        $counting{GA_CT_count}++;
        $alignment_strand     = '-';
        $read_conversion_info = 'GA';
        $genome_conversion    = 'CT';

        ### reverse complement!
        $non_bisulfite_sequence = reverse_complement($non_bisulfite_sequence);
    }
    elsif ($index == 3){
        ### GA converted read vs. GA converted genome [sequence originated from complementary to (converted) reverse strand]
        $counting{GA_GA_count}++;
        $alignment_strand     = '+';
        $read_conversion_info = 'GA';
        $genome_conversion    = 'GA';
    }
    else{
        die "Too many Bowtie 2 result filehandles\n";
    }

    $read_params->{alignment_strand}            = $alignment_strand;
    $read_params->{read_conversion}             = $read_conversion_info;
    $read_params->{genome_conversion}           = $genome_conversion;
    $read_params->{unmodified_genomic_sequence} = $non_bisulfite_sequence;

    ### after walking through the CIGAR string $pos holds the end position of the read
    $read_params->{end_position} = $pos;
    $read_params->{indels}       = $indels;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3536
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3537 ### METHYLATION CALL
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3538
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3539 sub methylation_call{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3540 my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3541 ### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3542 my @seq = split(//,$sequence_actually_observed);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3543 my @genomic = split(//,$genomic_sequence);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3544 # print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3545 ### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3546 ### CpG, CHH or CHG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3547
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3548 #################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3549 ### . for bases not involving cytosines ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3550 ### X for methylated C in CHG context (was protected) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3551 ### x for not methylated C in CHG context (was converted) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3552 ### H for methylated C in CHH context (was protected) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3553 ### h for not methylated C in CHH context (was converted) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3554 ### Z for methylated C in CpG context (was protected) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3555 ### z for not methylated C in CpG context (was converted) ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3556 #################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3557
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3558 my @match =();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3559 warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3560 my $methyl_CHH_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3561 my $methyl_CHG_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3562 my $methyl_CpG_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3563 my $unmethylated_CHH_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3564 my $unmethylated_CHG_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3565 my $unmethylated_CpG_count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3566
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3567 if ($read_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3568 for my $index (0..$#seq) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3569 if ($seq[$index] eq $genomic[$index]) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3570 ### The residue can only be a C if it was not converted to T, i.e. protected my methylation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3571 if ($genomic[$index] eq 'C') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3572 ### If the residue is a C we want to know if it was in CpG context or in any other context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3573 my $downstream_base = $genomic[$index+1];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3574
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3575 if ($downstream_base eq 'G'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3576 ++$methyl_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3577 push @match,'Z'; # protected C, methylated, in CpG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3578 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3579
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3580 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3581 ### C in not in CpG-context, determining the second downstream base context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3582 my $second_downstream_base = $genomic[$index+2];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3583
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3584 if ($second_downstream_base eq 'G'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3585 ++$methyl_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3586 push @match,'X'; # protected C, methylated, in CHG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3587 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3588 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3589 ++$methyl_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3590 push @match,'H'; # protected C, methylated, in CHH context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3591 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3592 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3593 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3594 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3595 push @match, '.';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3596 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3597 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3598 elsif ($seq[$index] ne $genomic[$index]) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3599 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted into Ts
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3600 ### in the actually observed sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3601 if ($genomic[$index] eq 'C' and $seq[$index] eq 'T') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3602 ### If the residue was converted to T we want to know if it was in CpG, CHG or CHH context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3603 my $downstream_base = $genomic[$index+1];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3604
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3605 if ($downstream_base eq 'G'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3606 ++$unmethylated_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3607 push @match,'z'; # converted C, not methylated, in CpG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3608 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3609
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3610 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3611 ### C in not in CpG-context, determining the second downstream base context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3612 my $second_downstream_base = $genomic[$index+2];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3613
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3614 if ($second_downstream_base eq 'G'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3615 ++$unmethylated_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3616 push @match,'x'; # converted C, not methylated, in CHG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3617 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3618 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3619 ++$unmethylated_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3620 push @match,'h'; # converted C, not methylated, in CHH context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3621 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3622 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3623 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3624 ### all other mismatches are not of interest for a methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3625 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3626 push @match,'.';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3627 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3628 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3629 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3630 die "There can be only 2 possibilities\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3631 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3632 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3633 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3634 elsif ($read_conversion eq 'GA'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3635 # print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3636
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3637 for my $index (0..$#seq) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3638 if ($seq[$index] eq $genomic[$index+2]) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3639 ### The residue can only be a G if the C on the other strand was not converted to T, i.e. protected my methylation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3640 if ($genomic[$index+2] eq 'G') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3641 ### If the residue is a G we want to know if the C on the other strand was in CpG, CHG or CHH context, therefore we need
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3642 ### to look if the base upstream is a C
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3643
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3644 my $upstream_base = $genomic[$index+1];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3645
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3646 if ($upstream_base eq 'C'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3647 ++$methyl_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3648 push @match,'Z'; # protected C on opposing strand, methylated, in CpG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3649 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3650
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3651 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3652 ### C in not in CpG-context, determining the second upstream base context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3653 my $second_upstream_base = $genomic[$index];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3654
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3655 if ($second_upstream_base eq 'C'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3656 ++$methyl_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3657 push @match,'X'; # protected C on opposing strand, methylated, in CHG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3658 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3659 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3660 ++$methyl_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3661 push @match,'H'; # protected C on opposing strand, methylated, in CHH context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3662 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3663 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3664 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3665 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3666 push @match, '.';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3667 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3668 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3669 elsif ($seq[$index] ne $genomic[$index+2]) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3670 ### for the methylation call we are only interested in mismatches involving cytosines (in the genomic sequence) which were converted to Ts
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3671 ### on the opposing strand, so G to A conversions in the actually observed sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3672 if ($genomic[$index+2] eq 'G' and $seq[$index] eq 'A') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3673 ### If the C residue on the opposing strand was converted to T then we will see an A in the currently observed sequence. We want to know if
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3674 ### the C on the opposing strand was it was in CpG, CHG or CHH context, therefore we need to look one (or two) bases upstream!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3675
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3676 my $upstream_base = $genomic[$index+1];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3677
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3678 if ($upstream_base eq 'C'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3679 ++$unmethylated_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3680 push @match,'z'; # converted C on opposing strand, not methylated, in CpG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3681 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3682
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3683 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3684 ### C in not in CpG-context, determining the second upstream base context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3685 my $second_upstream_base = $genomic[$index];
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3686
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3687 if ($second_upstream_base eq 'C'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3688 ++$unmethylated_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3689 push @match,'x'; # converted C on opposing strand, not methylated, in CHG context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3690 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3691 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3692 ++$unmethylated_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3693 push @match,'h'; # converted C on opposing strand, not methylated, in CHH context
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3694 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3695 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3696 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3697 ### all other mismatches are not of interest for a methylation call
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3698 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3699 push @match,'.';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3700 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3701 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3702 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3703 die "There can be only 2 possibilities\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3704 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3705 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3706 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3707 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3708 die "Strand conversion info is required to perform a methylation call\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3709 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3710
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3711 my $methylation_call = join ("",@match);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3712
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3713 $counting{total_meCHH_count} += $methyl_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3714 $counting{total_meCHG_count} += $methyl_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3715 $counting{total_meCpG_count} += $methyl_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3716 $counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3717 $counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3718 $counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3719
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3720 # print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3721 return $methylation_call;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3722 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3723
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Reads all FastA files found in the global $genome_folder (files ending in .fa, or, if
### there are none, files ending in .fasta) and stores every sequence, converted to upper
### case, in the global %chromosomes hash. The hash key is the chromosome name returned by
### extract_chromosome_name() for the sequence's FastA header. Single- as well as
### multi-FastA files are handled.
###
### Parameter: $cwd - the directory to change back to once the genome has been read
###
### Dies if $genome_folder can't be entered, if it doesn't contain any FastA files, if a
### sequence file is empty or can't be opened, if a chromosome name occurs more than once
### or if changing back to $cwd fails. Chromosomes without any sequence only cause a warning.
sub read_genome_into_memory{
  ## working directory
  my $cwd = shift;

  ## reading in and storing the specified genome in the %chromosomes hash
  chdir ($genome_folder) or die "Can't move to $genome_folder: $!";
  print "Now reading in and storing sequence information of the genome specified in: $genome_folder\n\n";

  my @chromosome_filenames = <*.fa>;

  ### if there aren't any genomic files with the extension .fa we will look for files with the extension .fasta
  unless (@chromosome_filenames){
    @chromosome_filenames = <*.fasta>;
  }

  unless (@chromosome_filenames){
    die "The specified genome folder $genome_folder does not contain any sequence files in FastA format (with .fa or .fasta file extensions)\n";
  }

  foreach my $chromosome_filename (@chromosome_filenames){

    ### 3-argument open with a lexical filehandle: the file name comes straight from the glob above
    ### and must be taken literally, it must never be parsed for mode characters such as > or |
    open (my $chr_in,'<',$chromosome_filename) or die "Failed to read from sequence file $chromosome_filename $!\n";

    ### first line needs to be a fastA header
    my $first_line = <$chr_in>;
    unless (defined $first_line){
      ### an empty file would otherwise only surface as 'uninitialized value' warnings further down
      die "The sequence file $chromosome_filename is empty and thus doesn't seem to be in FASTA format as required!\n";
    }
    chomp $first_line;
    $first_line =~ s/\r//;

    ### Extracting chromosome name from the FastA header
    my $chromosome_name = extract_chromosome_name($first_line);

    ### starting with an empty string (and not undef) so that the length checks below are well defined
    ### even if a header is not followed by any sequence lines
    my $sequence = '';

    while (my $line = <$chr_in>){
      chomp $line;
      $line =~ s/\r//;

      if ($line =~ /^>/){
        ### storing the previous chromosome in the %chromosomes hash, only relevant for Multi-Fasta-Files (MFA)
        if (exists $chromosomes{$chromosome_name}){
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name!\n";
        }
        else {
          if (length($sequence) == 0){
            warn "Chromosome $chromosome_name in the multi-fasta file $chromosome_filename did not contain any sequence information!\n";
          }
          print "chr $chromosome_name (",length $sequence ," bp)\n";
          $chromosomes{$chromosome_name} = $sequence;
        }
        ### resetting the sequence variable
        $sequence = '';
        ### setting new chromosome name
        $chromosome_name = extract_chromosome_name($line);
      }
      else{
        $sequence .= uc $line;
      }
    }

    ### we are done with this file, so the handle is released right away
    close ($chr_in) or warn "Failed to close sequence file $chromosome_filename: $!\n";

    ### storing the last (or, for single-FastA files, the only) chromosome of the file
    if (exists $chromosomes{$chromosome_name}){
      print "chr $chromosome_name (",length $sequence ," bp)\t";
      die "Exiting because chromosome name already exists. Please make sure all chromosomes have a unique name.\n";
    }
    else{
      if (length($sequence) == 0){
        warn "Chromosome $chromosome_name in the file $chromosome_filename did not contain any sequence information!\n";
      }
      print "chr $chromosome_name (",length $sequence ," bp)\n";
      $chromosomes{$chromosome_name} = $sequence;
    }
  }
  print "\n";
  chdir $cwd or die "Failed to move to directory $cwd\n";
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3795
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Returns the chromosome name contained in a FastA header line: the leading '>' is
### removed and the first of the whitespace-separated fields of the remainder is returned.
###
### Parameter: the FastA header line (expected to start with '>')
### Dies if the line does not start with '>'.
sub extract_chromosome_name {
  my $fasta_header = shift;

  ## anything that doesn't start with '>' is not a FastA header, so we bail out straight away
  unless ($fasta_header =~ s/^>//){
    die "The specified chromosome ($fasta_header) file doesn't seem to be in FASTA format as required!\n";
  }

  ## Bowtie seems to extract the first string after the initial > in the FASTA file, so we are doing this as well
  my @header_fields = split (/\s+/,$fasta_header);
  return $header_fields[0];
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3807
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Returns the reverse complement of a DNA sequence. Only the upper case bases
### A, C, G and T are complemented, every other character is just moved to its
### mirrored position unchanged.
sub reverse_complement{
  my $sequence = shift;

  ## turning the sequence around first ...
  my $reversed = reverse $sequence;

  ## ... and then swapping each base for its partner (G <-> C, A <-> T)
  $reversed =~ tr/GTAC/CATG/;

  return $reversed;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3814
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Writes in-silico bisulfite converted copies of a single-end FastA read file into the
### global $temp_dir. A C -> T converted version (<name>_C_to_T.fa) is always written; a
### G -> A converted version (<name>_G_to_A.fa) is written as well unless the global
### $directional is set. The input is read as pairs of lines (header, sequence), the read
### IDs are passed through fix_IDs() and the sequences are converted to upper case. The
### globals $skip and $upto restrict which reads are converted, and the global
### $seqID_contains_tabs is incremented for every converted header containing a tab.
###
### Parameter: $file - the FastA file to convert; names ending in .gz are read via zcat
### Returns:   the list (C->T file name, G->A file name); both names are given without
###            $temp_dir, and the G->A name is returned even if that file was not written
###
### Dies if the input can't be opened, if an output file can't be opened or closed, or if a
### header line doesn't start with '>'.
sub biTransformFastAFiles {
  my $file = shift;

  ### only the bare file name (without any leading path) is used to name the converted files
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  my $in;
  ### gzipped version of the infile
  if ($file =~ /\.gz$/){
    ### list form of the pipe open: $file is handed to zcat as a single argument and never passes
    ### through a shell, so blanks or shell metacharacters in the file name are harmless
    open ($in,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    open ($in,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile =~ s/$/_C_to_T.fa/;
  $G_to_A_infile =~ s/$/_G_to_A.fa/;
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (my $ctot,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";

  ### the G -> A file is only needed for non-directional libraries
  my $gtoa;
  unless ($directional){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open ($gtoa,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
  }

  my $count = 0;
  while (1){
    ### a FastA entry is expected to consist of exactly 2 lines: the header and the sequence
    my $header = <$in>;
    my $sequence= <$in>;
    last unless ($header and $sequence);

    $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto){
      last if ($count > $upto);
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    if (index($header,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ### small check if the sequence seems to be in FastA format
    die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print {$ctot} "$header$sequence_C_to_T";

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print {$gtoa} "$header$sequence_G_to_A";
    }
  }

  ### the return value of this close is deliberately ignored: if we stopped reading early (--upto)
  ### zcat gets terminated by a broken pipe and close reports this as a failure
  close ($in);

  ### closing the output files flushes all buffered data; a failure here means that a converted
  ### file is incomplete (e.g. because the disk is full) and must not be used for the alignments
  close ($ctot) or die "Failed to close file $temp_dir$C_to_T_infile: $!\n";
  unless ($directional){
    close ($gtoa) or die "Failed to close file $temp_dir$G_to_A_infile: $!\n";
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
  }
  return ($C_to_T_infile,$G_to_A_infile);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3898
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3899 sub biTransformFastAFiles_paired_end {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3900 my ($file,$read_number) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3901
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3902 my ($dir,$filename);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3903 if ($file =~ /\//){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3904 ($dir,$filename) = $file =~ m/(.*\/)(.*)$/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3905 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3906 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3907 $filename = $file;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3908 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3909
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3910 ### gzipped version of the infile
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3911 if ($file =~ /\.gz$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3912 open (IN,"zcat $file |") or die "Couldn't read from file $file: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3913 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3914 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3915 open (IN,$file) or die "Couldn't read from file $file: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3916 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3917
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3918 if ($skip){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3919 warn "Skipping the first $skip reads from $file\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3920 sleep (1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3921 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3922 if ($upto){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3923 warn "Processing reads up to sequence no. $upto from $file\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3924 sleep (1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3925 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3926
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3927 my $C_to_T_infile = my $G_to_A_infile = $filename;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3928 $C_to_T_infile =~ s/$/_C_to_T.fa/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3929 $G_to_A_infile =~ s/$/_G_to_A.fa/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3930
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3931 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3932 if ($read_number == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3933 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3934 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3935 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3936 elsif ($read_number == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3937 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3938 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3939 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3940 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3941 die "Read number needs to be 1 or 2, but was: $read_number\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3942 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3943 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3944 else{ # all four strand output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3945 print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3946 print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3947 open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3948 open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3949 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3950
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3951 my $count = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3952
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3953 while (1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3954 my $header = <IN>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3955 my $sequence= <IN>;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3956 last unless ($header and $sequence);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3957
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3958 $header = fix_IDs($header); # this is to avoid problems with truncated read ID when they contain white spaces
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3959
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3960 ++$count;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3961
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3962 if ($skip){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3963 next unless ($count > $skip);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3964 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3965 if ($upto){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3966 last if ($count > $upto);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3967 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3968
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3969 $sequence = uc$sequence; # make input file case insensitive
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3970
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3971 # detecting if the input file contains tab stops, as this is likely to result in no alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3972 if (index($header,"\t") != -1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3973 $seqID_contains_tabs++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3974 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3975
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3976 ## small check if the sequence seems to be in FastA format
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3977 die "Input file doesn't seem to be in FastA format at sequence $count: $!\n" unless ($header =~ /^>.*/);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3978
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3979 if ($read_number == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3980 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3981 $header =~ s/$/\/1\/1/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3982 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3983 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3984 $header =~ s/$/\/1/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3985 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3986 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3987 elsif ($read_number == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3988 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3989 $header =~ s/$/\/2\/2/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3990 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3991 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3992 $header =~ s/$/\/2/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3993 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3994 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3995 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3996 die "Read number needs to be 1 or 2, but was: $read_number\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3997 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3998 my $sequence_C_to_T = my $sequence_G_to_A = $sequence;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
3999
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4000 $sequence_C_to_T =~ tr/C/T/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4001 $sequence_G_to_A =~ tr/G/A/;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4002
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4003 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4004
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4005 if ($read_number == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4006 print CTOT "$header$sequence_C_to_T";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4007 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4008 elsif ($read_number == 2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4009 print GTOA "$header$sequence_G_to_A";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4010 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4011 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4012 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4013 print CTOT "$header$sequence_C_to_T";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4014 print GTOA "$header$sequence_G_to_A";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4015 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4016 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4017
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4018 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4019 if ($read_number == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4020 print "\nCreated C -> T converted version of the FastA file $filename ($count sequences in total)\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4021 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4022 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4023 print "\nCreated G -> A converted version of the FastA file $filename ($count sequences in total)\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4024 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4025 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4026 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4027 print "\nCreated C -> T as well as G -> A converted versions of the FastA file $filename ($count sequences in total)\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4028 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4029
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4030 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4031 if ($read_number == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4032 return ($C_to_T_infile);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4033 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4034 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4035 return ($G_to_A_infile);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4036 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4037 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4038 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4039 return ($C_to_T_infile,$G_to_A_infile);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4040 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4041 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4042
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4043
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Creates bisulfite converted version(s) of a single-end FastQ file inside $temp_dir.
###
### A C -> T converted copy is always written (filehandle CTOT); a G -> A converted copy
### (filehandle GTOA) is only written if $directional is not set.
###
### Argument: path of the FastQ file; it is read through zcat if its name ends in .gz.
### Uses the file-level variables $skip, $upto, $directional, $temp_dir and $seqID_contains_tabs.
### Returns:  the list ($C_to_T_infile,$G_to_A_infile) of output file names (without $temp_dir).
###           In directional mode no G -> A file is written and the second element is just the
###           input file name without its directory part.
### Dies if a file can't be opened, or if a processed record doesn't start with '@' / '+'.
### The filehandles IN, CTOT and GTOA are not closed here (this is unchanged behaviour).
sub biTransformFastQFiles {
  my $file = shift;

  ### the output file names are based on the input file name without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never passes through a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    ### 3-argument open so that characters in the file name can't be mistaken for an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;

  $C_to_T_infile .= '_C_to_T.fastq';
  print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $temp_dir$C_to_T_infile: $!\n";

  unless ($directional){
    $G_to_A_infile .= '_G_to_A.fastq';
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $temp_dir$G_to_A_infile: $!\n";
  }

  my $count = 0;
  while (1){
    ### a FastQ record consists of 4 lines
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;

    ### testing for definedness instead of truth: a final line consisting of just '0' without a
    ### line break is valid data and must not be mistaken for the end of the file
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    ++$count;

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto and $count > $upto){
      --$count; # this read is not converted, so it mustn't show up in the total reported below
      last;
    }

    $sequence = uc$sequence; # make input file case insensitive

    # detecting if the input file contains tab stops, as this is likely to result in no alignments
    # (fix_IDs as defined in this file has already replaced tabs by underscores at this point, so this
    # check can't trigger any more; it is kept so that the handling of $seqID_contains_tabs stays unchanged)
    if (index($identifier,"\t") != -1){
      $seqID_contains_tabs++;
    }

    ## small check if the sequence file appears to be a FastQ file
    ## ($! is deliberately not part of the message: no system call has failed here, so it would be stale)
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count\n";
    }

    my $sequence_C_to_T = $sequence;
    $sequence_C_to_T =~ tr/C/T/;
    print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);

    unless ($directional){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  if ($directional){
    print "\nCreated C -> T converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }
  else{
    print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  }

  return ($C_to_T_infile,$G_to_A_infile);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4134
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Creates bisulfite converted version(s) of one file of a paired-end FastQ data set inside $temp_dir.
###
### Arguments: ($file,$read_number)
###   $file        - path of the FastQ file; it is read through zcat if its name ends in .gz
###   $read_number - 1 or 2, i.e. which mate of the pair this file holds
###
### If $directional is set, read 1 is only C -> T converted (filehandle CTOT) and read 2 is only
### G -> A converted (filehandle GTOA); otherwise both conversions are written for the file.
### The read number is appended to each read ID as /1 or /2 (or as /1/1 or /2/2 if $bowtie2 is set).
###
### Uses the file-level variables $skip, $upto, $directional, $bowtie2 and $temp_dir.
### Returns the name(s) of the file(s) written (without $temp_dir): ($C_to_T_infile) or
### ($G_to_A_infile) in directional mode, ($C_to_T_infile,$G_to_A_infile) otherwise.
### Dies if the read number is not 1 or 2, if a file can't be opened, or if a processed record
### doesn't start with '@' / '+'.
### The filehandles IN, CTOT and GTOA are not closed here (this is unchanged behaviour).
sub biTransformFastQFiles_paired_end {
  my ($file,$read_number) = @_;

  ### validating the read number once, before any file is opened or created; this used to be checked
  ### per read in non-directional mode, i.e. only after the output files had already been created
  unless (defined $read_number and ($read_number == 1 or $read_number == 2)){
    my $given = defined $read_number ? $read_number : 'undefined';
    die "Read number needs to be 1 or 2, but was $given!\n\n";
  }

  ### the output file names are based on the input file name without its directory part
  my $filename = $file;
  if ($file =~ m/.*\/(.*)$/){
    $filename = $1;
  }

  ### gzipped version of the infile
  ### (list form of the pipe open: the file name is handed to zcat directly and never passes through a shell)
  if ($file =~ /\.gz$/){
    open (IN,'-|','zcat',$file) or die "Couldn't read from file $file: $!\n";
  }
  else{
    ### 3-argument open so that characters in the file name can't be mistaken for an open mode
    open (IN,'<',$file) or die "Couldn't read from file $file: $!\n";
  }

  if ($skip){
    warn "Skipping the first $skip reads from $file\n";
    sleep (1);
  }
  if ($upto){
    warn "Processing reads up to sequence no. $upto from $file\n";
    sleep (1);
  }

  my $C_to_T_infile = my $G_to_A_infile = $filename;
  $C_to_T_infile .= '_C_to_T.fastq';
  $G_to_A_infile .= '_G_to_A.fastq';

  ### which conversions are needed for this file: directional libraries only need C -> T for
  ### read 1 and G -> A for read 2, non-directional libraries need both for either read
  my $write_C_to_T = (!$directional || $read_number == 1) ? 1 : 0;
  my $write_G_to_A = (!$directional || $read_number == 2) ? 1 : 0;

  if ($write_C_to_T){
    print "Writing a C -> T converted version of the input file $filename to $temp_dir$C_to_T_infile\n";
  }
  if ($write_G_to_A){
    print "Writing a G -> A converted version of the input file $filename to $temp_dir$G_to_A_infile\n";
  }
  if ($write_C_to_T){
    open (CTOT,'>',"$temp_dir$C_to_T_infile") or die "Couldn't write to file $temp_dir$C_to_T_infile: $!\n";
  }
  if ($write_G_to_A){
    open (GTOA,'>',"$temp_dir$G_to_A_infile") or die "Couldn't write to file $temp_dir$G_to_A_infile: $!\n";
  }

  ### the tag appended to every read ID is the same for the whole file, so it is worked out only once:
  ### /1 or /2, doubled to /1/1 or /2/2 if $bowtie2 is set
  my $read_tag = ($read_number == 1) ? '/1' : '/2';
  $read_tag .= $read_tag if ($bowtie2);

  my $count = 0;

  while (1){
    ### a FastQ record consists of 4 lines
    my $identifier = <IN>;
    my $sequence = <IN>;
    my $identifier2 = <IN>;
    my $quality_score = <IN>;

    ### testing for definedness instead of truth: a final line consisting of just '0' without a
    ### line break is valid data and must not be mistaken for the end of the file
    last unless (defined $identifier and defined $sequence and defined $identifier2 and defined $quality_score);
    ++$count;

    $identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces

    if ($skip){
      next unless ($count > $skip);
    }
    if ($upto and $count > $upto){
      --$count; # this read is not converted, so it mustn't show up in the total reported below
      last;
    }

    $sequence = uc$sequence; # make input file case insensitive

    ## small check if the sequence file appears to be a FastQ file
    ## ($! is deliberately not part of the message: no system call has failed here, so it would be stale)
    if ($identifier !~ /^\@/ or $identifier2 !~ /^\+/){
      die "Input file doesn't seem to be in FastQ format at sequence $count\n";
    }

    ### $ matches in front of the line break, so the tag ends up at the end of the ID itself
    $identifier =~ s/$/$read_tag/;

    if ($write_C_to_T){
      my $sequence_C_to_T = $sequence;
      $sequence_C_to_T =~ tr/C/T/;
      print CTOT join ('',$identifier,$sequence_C_to_T,$identifier2,$quality_score);
    }
    if ($write_G_to_A){
      my $sequence_G_to_A = $sequence;
      $sequence_G_to_A =~ tr/G/A/;
      print GTOA join ('',$identifier,$sequence_G_to_A,$identifier2,$quality_score);
    }
  }

  if ($directional){
    if ($read_number == 1){
      print "\nCreated C -> T converted version of the FastQ file $filename ($count sequences in total)\n\n";
      return ($C_to_T_infile);
    }
    else{
      print "\nCreated G -> A converted version of the FastQ file $filename ($count sequences in total)\n\n";
      return ($G_to_A_infile);
    }
  }

  print "\nCreated C -> T as well as G -> A converted versions of the FastQ file $filename ($count sequences in total)\n\n";
  return ($C_to_T_infile,$G_to_A_infile);
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4274
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Takes a read ID line and returns a copy in which every run of spaces and/or tabs
### has been replaced by a single underscore. Line breaks are left untouched.
sub fix_IDs{
  my ($read_id) = @_;
  s/[ \t]+/_/g for $read_id; # works on the local copy only, the caller's string is not modified
  return $read_id;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4280
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4281 sub ensure_sensical_alignment_orientation_single_end{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4282 my $index = shift; # index number if the sequence produced an alignment
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4283 my $strand = shift;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4284 ### setting $orientation to 1 if it is in the correct orientation, and leave it 0 if it is the nonsensical wrong one
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4285 my $orientation = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4286 ##############################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4287 ## FORWARD converted read against FORWARD converted genome (read: C->T.....C->T.. genome:C->T.......C->T)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4288 ## here we only want reads in the forward (+) orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4289 if ($fhs[$index]->{name} eq 'CTreadCTgenome') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4290 ### if the alignment is (+) we count it, and return 1 for a correct orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4291 if ($strand eq '+') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4292 $fhs[$index]->{seen}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4293 $orientation = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4294 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4295 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4296 ### if the orientation equals (-) the alignment is nonsensical
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4297 elsif ($strand eq '-') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4298 $fhs[$index]->{wrong_strand}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4299 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4300 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4301 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4302 ###############################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4303 ## FORWARD converted read against reverse converted genome (read: C->T.....C->T.. genome: G->A.......G->A)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4304 ## here we only want reads in the forward (-) orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4305 elsif ($fhs[$index]->{name} eq 'CTreadGAgenome') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4306 ### if the alignment is (-) we count it and return 1 for a correct orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4307 if ($strand eq '-') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4308 $fhs[$index]->{seen}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4309 $orientation = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4310 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4311 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4312 ### if the orientation equals (+) the alignment is nonsensical
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4313 elsif ($strand eq '+') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4314 $fhs[$index]->{wrong_strand}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4315 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4316 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4317 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4318 ###############################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4319 ## Reverse converted read against FORWARD converted genome (read: G->A.....G->A.. genome: C->T.......C->T)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4320 ## here we only want reads in the forward (-) orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4321 elsif ($fhs[$index]->{name} eq 'GAreadCTgenome') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4322 ### if the alignment is (-) we count it and return 1 for a correct orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4323 if ($strand eq '-') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4324 $fhs[$index]->{seen}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4325 $orientation = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4326 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4327 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4328 ### if the orientation equals (+) the alignment is nonsensical
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4329 elsif ($strand eq '+') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4330 $fhs[$index]->{wrong_strand}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4331 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4332 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4333 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4334 ###############################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4335 ## Reverse converted read against reverse converted genome (read: G->A.....G->A.. genome: G->A.......G->A)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4336 ## here we only want reads in the forward (+) orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4337 elsif ($fhs[$index]->{name} eq 'GAreadGAgenome') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4338 ### if the alignment is (+) we count it and return 1 for a correct orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4339 if ($strand eq '+') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4340 $fhs[$index]->{seen}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4341 $orientation = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4342 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4343 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4344 ### if the orientation equals (-) the alignment is nonsensical
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4345 elsif ($strand eq '-') {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4346 $fhs[$index]->{wrong_strand}++;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4347 return $orientation;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4348 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4349 } else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4350 die "One of the above conditions must be true\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4351 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4352 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4353
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub ensure_sensical_alignment_orientation_paired_ends{
  ### Decides whether a paired-end alignment reported by one of the Bowtie instances is in a sensible orientation.
  ###
  ### Arguments: $index              - position of the Bowtie instance in @fhs
  ###            $id_1, $strand_1    - sequence ID and strand of the first reported alignment line
  ###            $id_2, $strand_2    - sequence ID and strand of the second reported alignment line
  ###
  ### Returns 1 (and increments the instance's 'seen' counter) for a sensible orientation, returns 0 (and
  ### increments the 'wrong_strand' counter) for the nonsensical one, and dies for anything else.
  my ($index,$id_1,$strand_1,$id_2,$strand_2) = @_;

  my $name     = $fhs[$index]->{name};
  my $instance = $fhs[$index];

  ### Working out which read has to be the one reported first, i.e. on the (+) strand, for this instance:
  ###   Index 0 (CT read 1, GA read 2, CT genome) and Index 1 (GA read 1, CT read 2, GA genome): read 1 (+), read 2 (-)
  ###   Index 2 (GA read 1, CT read 2, CT genome) and Index 3 (CT read 1, GA read 2, GA genome): read 2 (+), read 1 (-)
  my $expected_first;
  if ($name eq 'CTread1GAread2CTgenome' or $name eq 'GAread1CTread2GAgenome') {
    $expected_first = 1;
  }
  elsif ($name eq 'GAread1CTread2CTgenome' or $name eq 'CTread1GAread2GAgenome') {
    $expected_first = 2;
  }
  else{
    die "One of the above conditions must be true\n";
  }

  ### Only a first line on the (+) strand followed by a second line on the (-) strand is ever acceptable;
  ### the two flags below record which read occupies which of the two lines
  my $plus_then_minus  = ($strand_1 eq '+' and $strand_2 eq '-');
  my $read1_then_read2 = ($plus_then_minus and $id_1 =~ /1$/ and $id_2 =~ /2$/);
  my $read2_then_read1 = ($plus_then_minus and $id_1 =~ /2$/ and $id_2 =~ /1$/);

  my ($sensical,$nonsensical) = ($expected_first == 1)
    ? ($read1_then_read2,$read2_then_read1)
    : ($read2_then_read1,$read1_then_read2);

  ### the expected read order was found, so we count the alignment and report a correct orientation
  if ($sensical) {
    $instance->{seen}++;
    return 1;
  }

  ### the reads are the other way round, the alignment is nonsensical
  if ($nonsensical) {
    $instance->{wrong_strand}++;
    return 0;
  }

  die "id1: $id_1\tid2: $id_2\tThis should be impossible\n";
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4450
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4451 #####################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4452
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4453 ### Bowtie 1 (default) | PAIRED-END | FASTA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4454
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub paired_end_align_fragments_to_bisulfite_genome_fastA {

  ### Launches one Bowtie paired-end process per entry in @fhs (entries without an inputfile_1 are skipped in
  ### directional mode) and primes each entry with the first pair of alignment lines.
  ###
  ### The four arguments are the names of the converted read files; they are only used for the status message,
  ### the files handed to Bowtie are taken from the inputfile_1 and inputfile_2 fields of each @fhs entry.
  ### For every entry last_seq_id, last_line_1 and last_line_2 are set, or left undefined if there was no alignment.
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  ## Reporting the input files and how many Bowtie instances are about to be started
  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    ### in directional mode, instances without an input file are not run and simply stay undefined
    if ($directional and not $fh->{inputfile_1}){
      $fh->{$_} = undef foreach ('last_seq_id','last_line_1','last_line_2');
      next;
    }

    ### restricting Bowtie to the one strand that makes sense for this read/genome combination
    my $bt_options = $bowtie_options;
    $bt_options .= ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome')
      ? ' --norc'
      : ' --nofw';

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options)\n";
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";

    ### a paired-end alignment always consists of two consecutive output lines
    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    ### without a complete first pair there is nothing to store for this instance
    unless ($line_1 and $line_2) {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    chomp ($line_1,$line_2);

    ### the sequence identifier is the first tab-delimited field of each output line
    my ($id_1) = split(/\t/,$line_1);
    my ($id_2) = split(/\t/,$line_2);

    ### Bowtie reports the alignment with the smaller chromosomal position first, which can be either read 1 or read 2.
    ### The ID stored as last_seq_id is the one of read 1, with its /1 tag removed
    my $read_1_id;
    if ($id_1 =~ s/\/1$//){
      $read_1_id = $id_1;
    }
    elsif ($id_2 =~ s/\/1$//){
      $read_1_id = $id_2;
    }
    else{
      die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
    }
    $fh->{last_seq_id} = $read_1_id;

    ### either of the two stored lines may hold read 1 or read 2
    $fh->{last_line_1} = $line_1;
    $fh->{last_line_2} = $line_2;
    warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4533
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4534 ### Bowtie 2 | PAIRED-END | FASTA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4535
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4536 sub paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4537 my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4538 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4539 print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastA)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4540 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4541 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4542 print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastA)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4543 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4544
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4545 ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4546 ## data structure above
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4547 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4548 warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4549 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4550 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4551 warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4552 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4553
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4554 foreach my $fh (@fhs) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4555
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4556 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4557 unless ($fh->{inputfile_1}){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4558 $fh->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4559 $fh->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4560 $fh->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4561 next;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4562 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4563 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4564
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4565 my $bt2_options = $bowtie_options;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4566 if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4567 $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4568 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4569 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4570 $bt2_options .= ' --nofw';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4571 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4572
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4573 warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4574 open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2} |") or die "Can't open pipe to bowtie: $!";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4575
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4576 ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4577 while (1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4578 $_ = $fh->{fh}->getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4579 if ($_) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4580 last unless ($_ =~ /^\@/); # SAM headers start with @
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4581 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4582 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4583 last; # no alignment output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4584 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4585 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4586
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4587 my $line_1 = $_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4588 my $line_2 = $fh->{fh}->getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4589
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4590 # if Bowtie produces an alignment we store the first line of the output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4591 if ($line_1 and $line_2) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4592 chomp $line_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4593 chomp $line_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4594 my $id_1 = (split(/\t/,$line_1))[0]; # this is the first element of the first bowtie output line (= the sequence identifier)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4595 my $id_2 = (split(/\t/,$line_2))[0]; # this is the first element of the second bowtie output line
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4596
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4597 ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4598 ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4599
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4600 if ($id_1 =~ s/\/1$//){ # removing the read 1 /1 tag if present (remember that Bowtie2 clips off /1 or /2 line endings itself, so we added /1/1 or /2/2 to start with
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4601 $fh->{last_seq_id} = $id_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4602 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4603 elsif ($id_2 =~ s/\/1$//){ # removing the read 1 /2 tag if present
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4604 $fh->{last_seq_id} = $id_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4605 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4606 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4607 warn "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4608 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4609
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4610 $fh->{last_line_1} = $line_1; # this contains either read 1 or read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4611 $fh->{last_line_2} = $line_2; # this contains either read 1 or read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4612 warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4613 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4614 # otherwise we just initialise last_seq_id and last_lines as undefined
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4615 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4616 print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4617 $fh->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4618 $fh->{last_line_1} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4619 $fh->{last_line_2} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4620 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4621 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4622 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4623
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4624 ### Bowtie 1 (default) | PAIRED-END | FASTQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4625
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Starts one Bowtie 1 paired-end alignment (FastQ input) per entry of the
### file-level @fhs list and primes each entry with the first pair of output lines.
###
### Arguments: the four converted read files (C->T read 1, G->A read 1,
###            C->T read 2, G->A read 2). Within this sub they are only used
###            for the progress message; the files actually handed to Bowtie
###            are taken from $fh->{inputfile_1} and $fh->{inputfile_2}.
### Effects:   for every $fh in @fhs sets $fh->{fh} (read pipe from Bowtie),
###            $fh->{last_seq_id}, $fh->{last_line_1} and $fh->{last_line_2};
###            the latter three are undef when no pair of lines could be read.
### Dies:      if the pipe cannot be opened, or if neither of the first two
###            output lines carries a read ID ending in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    print "Input files are $C_to_T_infile_1 $G_to_A_infile_2 (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie, feeding in the converted sequence files, reading in the
  ## first two lines of each Bowtie output and storing them in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode, entries without a first input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting a Bowtie paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt_options))\n";

    # Three-argument pipe open: the read mode is explicit instead of being parsed out of the command string.
    # NOTE(review): the command is still one string, so the shell splits $bt_options and the file names;
    # paths containing whitespace or shell metacharacters are not escaped here.
    open ($fh->{fh}, '-|', "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2}") or die "Can't open pipe to bowtie: $!";

    my $line_1 = $fh->{fh}->getline();
    my $line_2 = $fh->{fh}->getline();

    # getline() returns undef at end of input, so we test for definedness rather than truth.
    # If Bowtie produced an alignment we store the first two lines of the output
    if (defined $line_1 and defined $line_2) {
      chomp $line_1;
      chomp $line_2;

      ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
      ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
      my $id_1 = (split(/\t/,$line_1))[0]; # first tab-separated field of the first output line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # first tab-separated field of the second output line

      if ($id_1 =~ s/\/1$//){    # removing the read 1 tag if present
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){ # removing the read 1 tag if present
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4703
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4704 ### Bowtie 2 | PAIRED-END | FASTQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4705
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Starts one Bowtie 2 paired-end alignment (FastQ input) per entry of the
### file-level @fhs list, skips the SAM header and primes each entry with the
### first pair of alignment lines.
###
### Arguments: the four converted read files (C->T read 1, G->A read 1,
###            C->T read 2, G->A read 2). Within this sub they are only used
###            for the progress message; the files actually handed to Bowtie 2
###            are taken from $fh->{inputfile_1} and $fh->{inputfile_2}.
### Effects:   for every $fh in @fhs sets $fh->{fh} (read pipe from Bowtie 2),
###            $fh->{last_seq_id}, $fh->{last_line_1} and $fh->{last_line_2};
###            the latter three are undef when no pair of lines could be read.
### Dies:      if the pipe cannot be opened, or if neither of the first two
###            alignment lines carries a read ID ending in /1.
sub paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2) = @_;

  if ($directional){
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_2 (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile_1 and $G_to_A_infile_1 and $C_to_T_infile_2 and $G_to_A_infile_2 (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie 2, feeding in the converted sequence files, reading in the
  ## first two alignment lines of each output and storing them in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    # in directional mode, entries without a first input file are not aligned at all
    if ($directional and not $fh->{inputfile_1}){
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
      next;
    }

    my $bt2_options = $bowtie_options;
    if ($fh->{name} eq 'CTread1GAread2CTgenome' or $fh->{name} eq 'GAread1CTread2GAgenome'){
      $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt2_options .= ' --nofw';
    }

    warn "Now starting a Bowtie 2 paired-end alignment for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile_1} and $temp_dir$fh->{inputfile_2}, with the options: $bt2_options))\n";

    # Three-argument pipe open: the read mode is explicit instead of being parsed out of the command string.
    # NOTE(review): the command is still one string, so the shell splits $bt2_options and the file names;
    # paths containing whitespace or shell metacharacters are not escaped here.
    open ($fh->{fh}, '-|', "$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -1 $temp_dir$fh->{inputfile_1} -2 $temp_dir$fh->{inputfile_2}") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM format, so we need to skip everything until the first sequence.
    ### A lexical variable is used so that the caller's $_ is left untouched.
    my $line_1;
    while (defined($line_1 = $fh->{fh}->getline())) {
      last unless ($line_1 =~ /^\@/); # SAM headers start with @
    }
    # $line_1 is now the first non-header line, or undef if there was no alignment output
    my $line_2 = $fh->{fh}->getline();

    # if Bowtie 2 produced output we store the first two alignment lines
    if (defined $line_1 and defined $line_2) {
      chomp $line_1;
      chomp $line_2;

      ### Bowtie always reports the alignment with the smaller chromosomal position first. This can be either sequence 1 or sequence 2.
      ### We will thus identify which sequence was read 1 and store this ID as last_seq_id
      my $id_1 = (split(/\t/,$line_1))[0]; # first tab-separated field of the first alignment line (= the sequence identifier)
      my $id_2 = (split(/\t/,$line_2))[0]; # first tab-separated field of the second alignment line

      # removing the read 1 tag if present (remember that Bowtie 2 clips off /1 or /2 line endings itself,
      # so /1/1 or /2/2 was added to the read IDs to start with)
      if ($id_1 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_1;
      }
      elsif ($id_2 =~ s/\/1$//){
        $fh->{last_seq_id} = $id_2;
      }
      else{
        die "Either the first or the second id need to be read 1! ID1 was: $id_1; ID2 was: $id_2\n";
      }

      $fh->{last_line_1} = $line_1; # this contains read 1 or read 2
      $fh->{last_line_2} = $line_2; # this contains read 1 or read 2
      warn "Found first alignment:\n$fh->{last_line_1}\n$fh->{last_line_2}\n";
    }
    # otherwise we just initialise last_seq_id and last_lines as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_lines\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line_1} = undef;
      $fh->{last_line_2} = undef;
    }
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4794
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4795 #####################################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4796
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4797 ### Bowtie 1 (default) | SINGLE-END | FASTA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
### Starts one Bowtie 1 single-end alignment (FastA input) per entry of the
### file-level @fhs list and primes each entry with the first output line.
###
### Arguments: the C->T and G->A converted read files. Within this sub they are
###            only used for the progress message; the file actually handed to
###            Bowtie is taken from $fh->{inputfile}.
### Effects:   for every $fh in @fhs sets $fh->{fh} (read pipe from Bowtie),
###            $fh->{last_seq_id} and $fh->{last_line}; the latter two are undef
###            when Bowtie produced no output.
### Dies:      if the pipe cannot be opened.
sub single_end_align_fragments_to_bisulfite_genome_fastA {
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    print "Input file is $C_to_T_infile (FastA)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
  }

  ## Now starting up to 4 instances of Bowtie, feeding in the converted sequence files, reading in the
  ## first line of each Bowtie output and storing it in the @fhs data structure
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {

    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

    # Three-argument pipe open: the read mode is explicit instead of being parsed out of the command string.
    # NOTE(review): the command is still one string, so the shell splits $bt_options and the file name;
    # paths containing whitespace or shell metacharacters are not escaped here.
    open ($fh->{fh}, '-|', "$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile}") or die "Can't open pipe to bowtie: $!";

    # A lexical variable is used for the line so that the caller's $_ is left untouched;
    # getline() returns undef at end of input, so we test for definedness rather than truth.
    my $first_line = $fh->{fh}->getline();

    # if Bowtie produces an alignment we store the first line of the output
    if (defined $first_line) {
      chomp $first_line;
      my $id = (split(/\t/,$first_line))[0]; # first tab-separated field of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4846
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4847 ### Bowtie 2 | SINGLE-END | FASTA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4848 sub single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4849 my ($C_to_T_infile,$G_to_A_infile) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4850 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4851 print "Input file is $C_to_T_infile (FastA)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4852 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4853 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4854 print "Input files are $C_to_T_infile and $G_to_A_infile (FastA)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4855 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4856
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4857 ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4858 ## data structure above
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4859 if ($directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4860 warn "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4861 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4862 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4863 warn "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4864 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4865
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4866 foreach my $fh (@fhs) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4867
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4868 my $bt2_options = $bowtie_options;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4869 if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4870 $bt2_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4871 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4872 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4873 $bt2_options .= ' --nofw';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4874 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4875
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4876 warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt2_options)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4877 open ($fh->{fh},"$path_to_bowtie $bt2_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4878
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4879 ### Bowtie 2 outputs out SAM format, so we need to skip everything until the first sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4880 while (1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4881 $_ = $fh->{fh}->getline();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4882 if ($_) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4883 last unless ($_ =~ /^\@/); # SAM headers start with @
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4884 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4885 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4886 last; # no alignment output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4887 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4888 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4889
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4890 # Bowtie 2 outputs a result line even for sequences without any alignments. We thus store the first line of the output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4891 if ($_) {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4892 chomp;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4893 my $id = (split(/\t/))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4894 $fh->{last_seq_id} = $id;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4895 $fh->{last_line} = $_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4896 warn "Found first alignment:\t$fh->{last_line}\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4897 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4898 # otherwise we just initialise last_seq_id and last_line as undefined. This should only happen at the end of a file for Bowtie 2 output
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4899 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4900 print "Found no alignment, assigning undef to last_seq_id and last_line\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4901 $fh->{last_seq_id} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4902 $fh->{last_line} = undef;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4903 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4904 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4905 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4906
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4907
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4908 ### Bowtie 1 (default) | SINGLE-END | FASTQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub single_end_align_fragments_to_bisulfite_genome_fastQ {
  ### Starts one Bowtie 1 process for every entry in @fhs (single-end, FastQ input) and primes each
  ### entry with the first line of that process' output.
  ###
  ### Arguments: the names of the C->T and G->A converted input files. They are only used for the
  ### progress message here; the file that is actually aligned is $temp_dir . $fh->{inputfile}.
  ###
  ### For every $fh in @fhs this sub sets:
  ###   $fh->{fh}           - read end of the pipe from the Bowtie process
  ###   $fh->{last_seq_id}  - first tab-separated field of the first output line (undef if no output)
  ###   $fh->{last_line}    - the first output line, chomped (undef if no output)
  ###
  ### Dies if the pipe to Bowtie cannot be opened.
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  if ($directional){
    print "Input file is $C_to_T_infile (FastQ)\n";
  }
  else{
    print "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n";
  }

  ## Now starting up to 4 instances of Bowtie feeding in the converted sequence files and reading in the first line of the bowtie output, and storing it in
  ## the data structure above
  if ($directional){
    warn "Now running 2 instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }
  else{
    warn "Now running 4 individual instances of Bowtie against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  }

  foreach my $fh (@fhs) {
    ### Per-instance copy of the options so that the strand restriction of one instance does not leak into the next
    my $bt_options = $bowtie_options;
    if ($fh->{name} eq 'CTreadCTgenome' or $fh->{name} eq 'GAreadGAgenome'){
      $bt_options .= ' --norc'; ### ensuring the alignments are only reported in a sensible manner
    }
    else {
      $bt_options .= ' --nofw';
    }

    warn "Now starting the Bowtie aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options: $bt_options)\n";

    ### The pipe has to be opened with the per-instance $bt_options (and not the plain $bowtie_options), otherwise
    ### the --norc/--nofw restriction chosen above is never handed to Bowtie and the message above is misleading
    open ($fh->{fh},"$path_to_bowtie $bt_options $fh->{bisulfiteIndex} $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    # if Bowtie produces an alignment we store the first line of the output.
    # A lexical variable is used so that the caller's $_ is not overwritten
    my $first_line = $fh->{fh}->getline();
    if (defined $first_line) {
      chomp $first_line;
      my $id = (split(/\t/,$first_line))[0]; # this is the first element of the Bowtie output (= the sequence identifier)
      $fh->{last_seq_id} = $id;
      $fh->{last_line} = $first_line;
      warn "Found first alignment:\t$fh->{last_line}\n";
    }
    # otherwise we just initialise last_seq_id and last_line as undefined
    else {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
    }
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4956
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
4957 ### Bowtie 2 | SINGLE-END | FASTQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 {
  ### Bowtie 2 counterpart for single-end FastQ input: opens a pipe from one Bowtie 2 process per
  ### entry in @fhs, discards the SAM header and remembers the first record of each process in
  ### $fh->{last_seq_id} / $fh->{last_line} (both undef if the process produced no record).
  my ($C_to_T_infile,$G_to_A_infile) = @_;

  my $input_message = $directional
    ? "Input file is $C_to_T_infile (FastQ)\n\n"
    : "Input files are $C_to_T_infile and $G_to_A_infile (FastQ)\n\n";
  print $input_message;

  ## Up to 4 instances of Bowtie 2 are started below, each one fed with a converted sequence file
  my $instances_message = $directional
    ? "Now running 2 instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n"
    : "Now running 4 individual instances of Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $bowtie_options\n\n";
  warn $instances_message;

  for my $fh (@fhs) {
    ### CT read/CT genome and GA read/GA genome get --norc, the other two combinations get --nofw
    my $same_conversion = ($fh->{name} eq 'CTreadCTgenome' || $fh->{name} eq 'GAreadGAgenome');
    my $aligner_options = $bowtie_options;
    $aligner_options .= $same_conversion ? ' --norc' : ' --nofw';

    warn "Now starting the Bowtie 2 aligner for $fh->{name} (reading in sequences from $temp_dir$fh->{inputfile} with options $aligner_options)\n";
    warn "Using Bowtie 2 index: $fh->{bisulfiteIndex}\n\n";

    open ($fh->{fh},"$path_to_bowtie $aligner_options $fh->{bisulfiteIndex} -U $temp_dir$fh->{inputfile} |") or die "Can't open pipe to bowtie: $!";

    ### Bowtie 2 writes SAM, so keep reading while the line is a header line (starts with @);
    ### stop at the first record or when the output is exhausted
    do {
      $_ = $fh->{fh}->getline();
    } while ($_ and $_ =~ /^\@/);

    # nothing but header lines (or nothing at all) came back: mark this instance as exhausted
    unless ($_) {
      print "Found no alignment, assigning undef to last_seq_id and last_line\n";
      $fh->{last_seq_id} = undef;
      $fh->{last_line} = undef;
      next;
    }

    # Bowtie 2 prints a record even for sequences without an alignment, so the first record is always stored
    chomp;
    $fh->{last_seq_id} = (split(/\t/))[0]; # first column of the record = the sequence identifier
    $fh->{last_line} = $_;
    warn "Found first alignment:\t$fh->{last_line}\n";
  }
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5015
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5016 ###########################################################################################################################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5017
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub reset_counters_and_fhs{
  ### Re-initialises the file-level %counting hash (all counters back to 0) and rebuilds the
  ### file-level @fhs list of alignment instances for the given input file name.
  ###
  ### @fhs receives only the two CT-read instances if $directional is set and the file name
  ### contains no comma (single-end); in every other case it receives all four instances.
  my $filename = shift;

  my @counter_names = qw(
    total_meCHH_count
    total_meCHG_count
    total_meCpG_count
    total_unmethylated_CHH_count
    total_unmethylated_CHG_count
    total_unmethylated_CpG_count
    sequences_count
    no_single_alignment_found
    unsuitable_sequence_count
    genomic_sequence_could_not_be_extracted_count
    unique_best_alignment_count
    low_complexity_alignments_overruled_count
    CT_CT_count
    CT_GA_count
    GA_CT_count
    GA_GA_count
    CT_GA_CT_count
    GA_CT_GA_count
    GA_CT_CT_count
    CT_GA_GA_count
    alignments_rejected_count
  );
  # Counter legend:
  #   CT_CT_count     CT read/CT genome, original top strand
  #   CT_GA_count     CT read/GA genome, original bottom strand
  #   GA_CT_count     GA read/CT genome, complementary to original top strand
  #   GA_GA_count     GA read/GA genome, complementary to original bottom strand
  #   CT_GA_CT_count  CT read1/GA read2/CT genome, original top strand
  #   GA_CT_GA_count  GA read1/CT read2/GA genome, complementary to original bottom strand
  #   GA_CT_CT_count  GA read1/CT read2/CT genome, complementary to original top strand
  #   CT_GA_GA_count  CT read1/GA read2/GA genome, original bottom strand
  #   alignments_rejected_count  only relevant if --directional was specified
  %counting = map { $_ => 0 } @counter_names;

  # instance name => [ strand identity, bisulfite index basename ]
  my %instance_for = (
    CTreadCTgenome => [ 'con ori forward',       $CT_index_basename ],
    CTreadGAgenome => [ 'con ori reverse',       $GA_index_basename ],
    GAreadCTgenome => [ 'compl ori con forward', $CT_index_basename ],
    GAreadGAgenome => [ 'compl ori con reverse', $GA_index_basename ],
  );

  # a comma in the file name marks paired-end files; the file name is only inspected in directional mode
  my $ct_reads_only = ($directional && $filename !~ ',');
  my @instance_names = $ct_reads_only
    ? qw(CTreadCTgenome CTreadGAgenome)
    : qw(CTreadCTgenome CTreadGAgenome GAreadCTgenome GAreadGAgenome);

  @fhs = map {
    +{ name            => $_,
       strand_identity => $instance_for{$_}->[0],
       bisulfiteIndex  => $instance_for{$_}->[1],
       seen            => 0,
       wrong_strand    => 0,
     }
  } @instance_names;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5119
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5120
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5121 sub process_command_line{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5122 my @bowtie_options;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5123 my $help;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5124 my $mates1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5125 my $mates2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5126 my $path_to_bowtie;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5127 my $fastq;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5128 my $fasta;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5129 my $skip;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5130 my $qupto;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5131 my $phred64;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5132 my $phred33;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5133 my $solexa;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5134 my $mismatches;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5135 my $seed_length;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5136 my $best;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5137 my $sequence_format;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5138 my $version;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5139 my $quiet;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5140 my $chunk;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5141 my $non_directional;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5142 my $ceiling;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5143 my $maxins;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5144 my $minins;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5145 my $unmapped;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5146 my $multi_map;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5147 my $output_dir;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5148 my $bowtie2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5149 my $vanilla;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5150 my $sam_no_hd;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5151 my $seed_extension_fails;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5152 my $reseed_repetitive_seeds;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5153 my $most_valid_alignments;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5154 my $score_min;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5155 my $parallel;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5156 my $temp_dir;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5157 my $rdg;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5158 my $rfg;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5159
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5160 my $command_line = GetOptions ('help|man' => \$help,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5161 '1=s' => \$mates1,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5162 '2=s' => \$mates2,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5163 'path_to_bowtie=s' => \$path_to_bowtie,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5164 'f|fasta' => \$fasta,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5165 'q|fastq' => \$fastq,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5166 's|skip=i' => \$skip,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5167 'u|upto=i' => \$qupto,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5168 'phred33-quals' => \$phred33,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5169 'phred64-quals|solexa1' => \$phred64,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5170 'solexa-quals' => \$solexa,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5171 'n|seedmms=i' => \$mismatches,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5172 'l|seedlen=i' => \$seed_length,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5173 'no_best' => \$best,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5174 'version' => \$version,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5175 'quiet' => \$quiet,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5176 'chunkmbs=i' => \$chunk,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5177 'non_directional' => \$non_directional,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5178 'I|minins=i' => \$minins,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5179 'X|maxins=i' => \$maxins,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5180 'e|maqerr=i' => \$ceiling,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5181 'un|unmapped' => \$unmapped,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5182 'ambiguous' => \$multi_map,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5183 'o|output_dir=s' => \$output_dir,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5184 'bowtie2' => \$bowtie2,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5185 'vanilla' => \$vanilla,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5186 'sam-no-hd' => \$sam_no_hd,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5187 'D=i' => \$seed_extension_fails,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5188 'R=i' => \$reseed_repetitive_seeds,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5189 'score_min=s' => \$score_min,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5190 'most_valid_alignments=i' => \$most_valid_alignments,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5191 'p=i' => \$parallel,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5192 'temp_dir=s' => \$temp_dir,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5193 'rdg=s' => \$rdg,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5194 'rfg=s' => \$rfg,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5195 );
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5196
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5197
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5198 ### EXIT ON ERROR if there were errors with any of the supplied options
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5199 unless ($command_line){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5200 die "Please respecify command line options\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5201 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5202 ### HELPFILE
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5203 if ($help){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5204 print_helpfile();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5205 exit;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5206 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5207 if ($version){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5208 print << "VERSION";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5209
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5210
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5211 Bismark - Bisulfite Mapper and Methylation Caller.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5212
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5213 Bismark Version: $bismark_version Copyright 2010-12 Felix Krueger, Babraham Bioinformatics
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5214 www.bioinformatics.babraham.ac.uk/projects/
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5215
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5216
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5217 VERSION
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5218 exit;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5219 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5220
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5221
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5222 ##########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5223 ### PROCESSING OPTIONS ###
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5224 ##########################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5225
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5226 unless ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5227 $bowtie2 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5228 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5229 unless ($sam_no_hd){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5230 $sam_no_hd =0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5231 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5232
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5233 ### PATH TO BOWTIE
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5234 ### if a special path to Bowtie 1/2 was specified we will use that one, otherwise it is assumed that Bowtie 1/2 is in the PATH
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5235 if ($path_to_bowtie){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5236 unless ($path_to_bowtie =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5237 $path_to_bowtie =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5238 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5239 if (-d $path_to_bowtie){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5240 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5241 $path_to_bowtie = "${path_to_bowtie}bowtie2";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5242 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5243 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5244 $path_to_bowtie = "${path_to_bowtie}bowtie";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5245 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5246 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5247 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5248 die "The path to bowtie provided ($path_to_bowtie) is invalid (not a directory)!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5249 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5250 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5251 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5252 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5253 $path_to_bowtie = 'bowtie2';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5254 warn "Path to Bowtie 2 specified as: $path_to_bowtie\n"; }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5255 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5256 $path_to_bowtie = 'bowtie';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5257 warn "Path to Bowtie specified as: $path_to_bowtie\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5258 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5259 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5260
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5261 ####################################
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5262 ### PROCESSING ARGUMENTS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5263
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5264 ### GENOME FOLDER
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5265 my $genome_folder = shift @ARGV; # mandatory
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5266 unless ($genome_folder){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5267 warn "Genome folder was not specified!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5268 print_helpfile();
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5269 exit;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5270 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5271
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5272 ### checking that the genome folder, all subfolders and the required bowtie index files exist
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5273 unless ($genome_folder =~/\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5274 $genome_folder =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5275 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5276
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5277 if (chdir $genome_folder){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5278 my $absolute_genome_folder = getcwd; ## making the genome folder path absolute
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5279 unless ($absolute_genome_folder =~/\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5280 $absolute_genome_folder =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5281 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5282 warn "Reference genome folder provided is $genome_folder\t(absolute path is '$absolute_genome_folder)'\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5283 $genome_folder = $absolute_genome_folder;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5284 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5285 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5286 die "Failed to move to $genome_folder: $!\nUSAGE: Bismark.pl [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>} [<hits>] (--help for more details)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5287 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5288
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5289 my $CT_dir = "${genome_folder}Bisulfite_Genome/CT_conversion/";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5290 my $GA_dir = "${genome_folder}Bisulfite_Genome/GA_conversion/";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5291
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5292 if ($bowtie2){ ### Bowtie 2 (new)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5293 ### checking the integrity of $CT_dir
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5294 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5295 my @CT_bowtie_index = ('BS_CT.1.bt2','BS_CT.2.bt2','BS_CT.3.bt2','BS_CT.4.bt2','BS_CT.rev.1.bt2','BS_CT.rev.2.bt2');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5296 foreach my $file(@CT_bowtie_index){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5297 unless (-f $file){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5298 die "The Bowtie 2 index of the C->T converted genome seems to be faulty ($file). Please run the bismark_genome_preparation before running Bismark.\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5299 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5300 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5301 ### checking the integrity of $GA_dir
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5302 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5303 my @GA_bowtie_index = ('BS_GA.1.bt2','BS_GA.2.bt2','BS_GA.3.bt2','BS_GA.4.bt2','BS_GA.rev.1.bt2','BS_GA.rev.2.bt2');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5304 foreach my $file(@GA_bowtie_index){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5305 unless (-f $file){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5306 die "The Bowtie 2 index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5307 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5308 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5309 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5310
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5311 else{ ### Bowtie 1 (default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5312 ### checking the integrity of $CT_dir
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5313 chdir $CT_dir or die "Failed to move to directory $CT_dir: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5314 my @CT_bowtie_index = ('BS_CT.1.ebwt','BS_CT.2.ebwt','BS_CT.3.ebwt','BS_CT.4.ebwt','BS_CT.rev.1.ebwt','BS_CT.rev.2.ebwt');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5315 foreach my $file(@CT_bowtie_index){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5316 unless (-f $file){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5317 die "The Bowtie index of the C->T converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5318 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5319 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5320 ### checking the integrity of $GA_dir
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5321 chdir $GA_dir or die "Failed to move to directory $GA_dir: $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5322 my @GA_bowtie_index = ('BS_GA.1.ebwt','BS_GA.2.ebwt','BS_GA.3.ebwt','BS_GA.4.ebwt','BS_GA.rev.1.ebwt','BS_GA.rev.2.ebwt');
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5323 foreach my $file(@GA_bowtie_index){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5324 unless (-f $file){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5325 die "The Bowtie index of the G->A converted genome seems to be faulty ($file). Please run bismark_genome_preparation before running Bismark.\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5326 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5327 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5328 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5329
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5330 my $CT_index_basename = "${CT_dir}BS_CT";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5331 my $GA_index_basename = "${GA_dir}BS_GA";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5332
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5333 ### INPUT OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5334
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5335 ### SEQUENCE FILE FORMAT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5336 ### exits if both fastA and FastQ were specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5337 if ($fasta and $fastq){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5338 die "Only one sequence filetype can be specified (fastA or fastQ)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5339 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5340
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5341 ### unless fastA is specified explicitly, fastQ sequence format is expected by default
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5342 if ($fasta){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5343 print "FastA format specified\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5344 $sequence_format = 'FASTA';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5345 push @bowtie_options, '-f';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5346 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5347 elsif ($fastq){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5348 print "FastQ format specified\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5349 $sequence_format = 'FASTQ';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5350 push @bowtie_options, '-q';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5351 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5352 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5353 $fastq = 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5354 print "FastQ format assumed (by default)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5355 $sequence_format = 'FASTQ';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5356 push @bowtie_options, '-q';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5357 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5358
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5359 ### SKIP
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5360 if ($skip){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5361 warn "Skipping the first $skip reads from the input file\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5362 # push @bowtie_options,"-s $skip";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5363 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5364
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5365 ### UPTO
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5366 if ($qupto){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5367 warn "Processing sequences up to read no. $qupto from the input file\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5368 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5369 # push @bowtie_options,"--upto $qupto"; ## slightly changed for Bowtie 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5370 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5371 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5372 # push @bowtie_options,"--qupto $qupto";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5373 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5374 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5375
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5376 ### QUALITY VALUES
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5377 if (($phred33 and $phred64) or ($phred33 and $solexa) or ($phred64 and $solexa)){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5378 die "You can only specify one type of quality value at a time! (--phred33-quals or --phred64-quals or --solexa-quals)";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5379 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5380 if ($phred33){ ## if nothing else is specified $phred33 will be used as default by both Bowtie 1 and 2.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5381 # Phred quality values work only when -q is specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5382 unless ($fastq){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5383 die "Phred quality values works only when -q (FASTQ) is specified\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5384 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5385 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5386 push @bowtie_options,"--phred33";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5387 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5388 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5389 push @bowtie_options,"--phred33-quals";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5390 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5391 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5392 if ($phred64){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5393 # Phred quality values work only when -q is specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5394 unless ($fastq){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5395 die "Phred quality values work only when -q (FASTQ) is specified\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5396 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5397 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5398 push @bowtie_options,"--phred64";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5399 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5400 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5401 push @bowtie_options,"--phred64-quals";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5402 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5403 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5404 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5405 $phred64 = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5406 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5407
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5408 if ($solexa){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5409 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5410 die "The option '--solexa-quals' is not compatible with Bowtie 2. Please respecify!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5411 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5412 # Solexa to Phred value conversion works only when -q is specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5413 unless ($fastq){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5414 die "Conversion from Solexa to Phred quality values works only when -q (FASTQ) is specified\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5415 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5416 push @bowtie_options,"--solexa-quals";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5417 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5418 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5419 $solexa = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5420 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5421
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5422 ### ALIGNMENT OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5423
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5424 ### MISMATCHES
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5425 if (defined $mismatches){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5426 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5427 if ($mismatches == 0 or $mismatches == 1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5428 push @bowtie_options,"-N $mismatches";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5429 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5430 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5431 die "Please set the number of multiseed mismatches for Bowtie 2 with '-N <int>' (where <int> can be 0 or 1)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5432 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5433 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5434 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5435 if ($mismatches >= 0 and $mismatches <= 3){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5436 push @bowtie_options,"-n $mismatches";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5437 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5438 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5439 die "Please set the number of seed mismatches for Bowtie 1 with '-n <int>' (where <int> can be 0,1,2 or 3)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5440 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5441 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5442 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5443 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5444 unless ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5445 push @bowtie_options,"-n 1"; # setting -n to 1 by default (for use with Bowtie only) because it is much quicker than the default mode of -n 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5446 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5447 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5448
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5449 ### SEED LENGTH
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5450 if (defined $seed_length){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5451 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5452 push @bowtie_options,"-L $seed_length";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5453 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5454 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5455 push @bowtie_options,"-l $seed_length";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5456 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5457 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5458
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5459 ### MISMATCH CEILING
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5460 if (defined $ceiling){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5461 die "The option '-e' is not compatible with Bowtie 2. Please respecify options\n" if ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5462 push @bowtie_options,"-e $ceiling";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5463 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5464
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5465
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5466 ### BOWTIE 2 EFFORT OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5467
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5468 ### CONSECUTIVE SEED EXTENSION FAILS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5469 if (defined $seed_extension_fails){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5470 die "The option '-D <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5471 push @bowtie_options,"-D $seed_extension_fails";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5472 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5473
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5474 ### RE-SEEDING REPETITIVE SEEDS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5475 if (defined $reseed_repetitive_seeds){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5476 die "The option '-R <int>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5477 push @bowtie_options,"-R $reseed_repetitive_seeds";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5478 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5479
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5480
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5481 ### BOWTIE 2 SCORING OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5482 if ($score_min){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5483 die "The option '--score_min <func>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5484 unless ($score_min =~ /^L,.+,.+$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5485 die "The option '--score_min <func>' needs to be in the format <L,value,value> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5486 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5487 push @bowtie_options,"--score-min $score_min";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5488 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5489 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5490 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5491 push @bowtie_options,"--score-min L,0,-0.2"; # default setting, more stringent than normal Bowtie2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5492 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5493 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5494
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5495 ### BOWTIE 2 READ GAP OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5496 if ($rdg){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5497 die "The option '--rdg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5498 unless ($rdg =~ /^.+,.+$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5499 die "The option '--rdg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5500 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5501 push @bowtie_options,"--rdg $rdg";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5502 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5503
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5504 ### BOWTIE 2 REFERENCE GAP OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5505 if ($rfg){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5506 die "The option '--rfg <int1>,<int2>' is only available when using Bowtie 2\n\n" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5507 unless ($rfg =~ /^.+,.+$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5508 die "The option '--rfg <int1>,<int2>' needs to be in the format <integer,integer> . Please consult \"setting up functions\" in the Bowtie 2 manual for further information\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5509 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5510 push @bowtie_options,"--rfg $rfg";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5511 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5512
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5513
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5514
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5515 ### BOWTIE 2 PARALLELIZATION OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5516 if (defined $parallel){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5517 die "The parallelization switch '-p' only works for Bowtie 2. Please respecify!" unless ($bowtie2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5518 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5519 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5520 if ($parallel){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5521 die "Please select a value for -p of 2 or more!\n" unless ($parallel > 1);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5522 push @bowtie_options,"-p $parallel";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5523 push @bowtie_options,'--reorder'; ## re-orders the bowtie 2 output so that it does match the input files. This is abolutely required for parallelization to work.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5524 print "Each Bowtie 2 instance is going to be run with $parallel threads. Please monitor performance closely and tune down if needed!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5525 sleep (2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5526 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5527 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5528
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5529 ### REPORTING OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5530
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5531 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5532 push @bowtie_options,'--ignore-quals'; ## All mismatches will receive penalty for mismatches as if they were of high quality, which is 6 by default
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5533
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5534 ### Option -M is deprecated since Bowtie 2 version 2.0.0 beta7. I'll leave this option commented out for a while
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5535 if(defined $most_valid_alignments){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5536
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5537 warn "\nThe option -M is now deprecated (as of Bowtie 2 version 2.0.0 beta7). What used to be called -M mode is still the default mode. Use the -D and -R options to adjust the effort expended to find valid alignments.\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5538 # push @bowtie_options,"-M $most_valid_alignments";sleep (5);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5539 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5540 # else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5541 # push @bowtie_options,'-M 10'; # the default behavior for Bowtie 2 is to report (and sort) up to 500 alignments for a given sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5542 # }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5543 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5544 else{ # Because of the way Bismark works we will always use the reporting option -k 2 (report up to 2 valid alignments) for Bowtie 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5545 push @bowtie_options,'-k 2';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5546 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5547
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5548 ### --BEST
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5549 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5550 if ($best){ # Bowtie 2 does away with the concept of --best, so one can also not select --no-best when Bowtie 2 is to be used
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5551 die "The option '--no-best' is not compatible with Bowtie 2. Please respecify options\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5552 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5553 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5554 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5555 # --best is the default option for Bowtie 1, specifying --no-best can turn it off (e.g. to speed up alignment process)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5556 unless ($best){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5557 push @bowtie_options,'--best';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5558 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5559 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5560
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5561 ### VANILLA BISMARK (BOWTIE 1) OUTPUT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5562 if ($vanilla){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5563 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5564 die "The options --bowtie2 and the --vanilla are not compatible. Please respecify!\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5565 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5566 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5567 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5568 $vanilla = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5569 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5570
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5571 ### PAIRED-END MAPPING
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5572 if ($mates1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5573 my @mates1 = (split (/,/,$mates1));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5574 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n" unless ($mates2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5575 my @mates2 = (split(/,/,$mates2));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5576 unless (scalar @mates1 == scalar @mates2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5577 die "Paired-end mapping requires the same amounnt of mate1 and mate2 files, please respecify! (format: -1 <mates1> -2 <mates2>)\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5578 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5579 while (1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5580 my $mate1 = shift @mates1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5581 my $mate2 = shift @mates2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5582 last unless ($mate1 and $mate2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5583 push @filenames,"$mate1,$mate2";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5584 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5585 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5586 push @bowtie_options,'--no-mixed'; ## By default Bowtie 2 is not looking for single-end alignments if it can't find concordant or discordant alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5587 push @bowtie_options,'--no-discordant';## By default Bowtie 2 is not looking for discordant alignments if it can't find concordant ones
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5588 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5589 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5590 elsif ($mates2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5591 die "Paired-end mapping requires the format: -1 <mates1> -2 <mates2>, please respecify!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5592 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5593
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5594 ### SINGLE-END MAPPING
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5595 # Single-end mapping will be performed if no mate pairs for paired-end mapping have been specified
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5596 my $singles;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5597 unless ($mates1 and $mates2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5598 $singles = join (',',@ARGV);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5599 unless ($singles){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5600 die "\nNo filename supplied! Please specify one or more files for single-end Bismark mapping!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5601 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5602 $singles =~ s/\s/,/g;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5603 @filenames = (split(/,/,$singles));
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5604 warn "\nFiles to be analysed:\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5605 warn "@filenames\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5606 sleep (3);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5607 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5608
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5609 ### MINIMUM INSERT SIZE (PAIRED-END ONLY)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5610 if (defined $minins){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5611 die "-I/--minins can only be used for paired-end mapping!\n\n" if ($singles);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5612 push @bowtie_options,"--minins $minins";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5613 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5614
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5615 ### MAXIMUM INSERT SIZE (PAIRED-END ONLY)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5616 if (defined $maxins){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5617 die "-X/--maxins can only be used for paired-end mapping!\n\n" if ($singles);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5618 push @bowtie_options,"--maxins $maxins";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5619 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5620 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5621 unless ($singles){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5622 push @bowtie_options,'--maxins 500';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5623 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5624 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5625
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5626 ### QUIET prints nothing besides alignments (suppresses warnings)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5627 if ($quiet){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5628 push @bowtie_options,'--quiet';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5629 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5630
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5631 ### CHUNKMBS needed to be increased to avoid memory exhaustion warnings for Bowtie 1, particularly for --best (and paired-end) alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5632 unless ($bowtie2){ # Bowtie 2 does not have a chunkmbs option
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5633 if (defined $chunk){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5634 push @bowtie_options,"--chunkmbs $chunk";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5635 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5636 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5637 push @bowtie_options,'--chunkmbs 512'; ## setting the default to 512MB (up from 64 default)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5638 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5639 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5640
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5641
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5642 ### SUMMARY OF ALL BOWTIE OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5643 my $bowtie_options = join (' ',@bowtie_options);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5644
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5645
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5646 ### STRAND-SPECIFIC LIBRARIES
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5647 my $directional;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5648 if ($non_directional){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5649 print "Library was specified to be not strand-specific (non-directional), therefore alignments to all four possible bisulfite strands (OT, CTOT, OB and CTOB) will be reported.\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5650 sleep (3);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5651 $directional = 0;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5652 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5653 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5654 print "Library is assumed to be strand-specific (directional), alignments to strands complementary to the original top or bottom strands will be ignored (i.e. not performed!).\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5655 sleep (3);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5656 $directional = 1; # Changed this to being the default behaviour
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5657 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5658
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5659 ### UNMAPPED SEQUENCE OUTPUT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5660 $unmapped = 0 unless ($unmapped);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5661
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5662 ### AMBIGUOUS ALIGNMENT SEQUENCE OUTPUT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5663 $multi_map = 0 unless ($multi_map);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5664
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5665
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5666 ### OUTPUT DIRECTORY
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5667
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5668 chdir $parent_dir or die "Failed to move back to current working directory\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5669 if ($output_dir){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5670 unless ($output_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5671 $output_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5672 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5673
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5674 if (chdir $output_dir){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5675 $output_dir = getcwd; # making the path absolute
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5676 unless ($output_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5677 $output_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5678 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5679 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5680 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5681 mkdir $output_dir or die "Unable to create directory $output_dir $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5682 warn "Created output directory $output_dir!\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5683 chdir $output_dir or die "Failed to move to $output_dir\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5684 $output_dir = getcwd; # making the path absolute
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5685 unless ($output_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5686 $output_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5687 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5688 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5689 warn "Output will be written into the directory: $output_dir\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5690 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5691 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5692 $output_dir = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5693 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5694
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5695 ### TEMPORARY DIRECTORY for C->T and G->A transcribed files
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5696
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5697 chdir $parent_dir or die "Failed to move back to current working directory\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5698 if ($temp_dir){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5699 warn "\nUsing temp directory: $temp_dir\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5700 unless ($temp_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5701 $temp_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5702 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5703
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5704 if (chdir $temp_dir){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5705 $temp_dir = getcwd; # making the path absolute
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5706 unless ($temp_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5707 $temp_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5708 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5709 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5710 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5711 mkdir $temp_dir or die "Unable to create directory $temp_dir $!\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5712 warn "Created temporary directory $temp_dir!\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5713 chdir $temp_dir or die "Failed to move to $temp_dir\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5714 $temp_dir = getcwd; # making the path absolute
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5715 unless ($temp_dir =~ /\/$/){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5716 $temp_dir =~ s/$/\//;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5717 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5718 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5719 warn "Temporary files will be written into the directory: $temp_dir\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5720 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5721 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5722 $temp_dir = '';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5723 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5724
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5725
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5726 return ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$sequence_format,$bowtie_options,$directional,$unmapped,$multi_map,$phred64,$solexa,$output_dir,$bowtie2,$vanilla,$sam_no_hd,$skip,$qupto,$temp_dir);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5727 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5728
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5729
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5730
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
sub generate_SAM_header{
  # Writes the SAM header to the global OUT filehandle: one @HD line, one @SQ
  # line per key of the file-level %chromosomes hash, and one @PG line.
  #
  # Takes no arguments. Reads %chromosomes, $bismark_version and $command_line
  # from the enclosing file scope; OUT must already be open for writing.

  print OUT "\@HD\tVN:1.0\tSO:unsorted\n";   # @HD = header, VN = version, SO = sort order

  # Perl does not guarantee any particular order for hash keys (and the order
  # may differ between runs), so the names are sorted to make the @SQ section
  # of the header deterministic.
  foreach my $chr (sort keys %chromosomes){
    my $length = length ($chromosomes{$chr}); # LN is the string length of the value stored under this name
    print OUT "\@SQ\tSN:$chr\tLN:$length\n";  # @SQ = sequence, SN = seq name, LN = length
  }

  print OUT "\@PG\tID:Bismark\tVN:$bismark_version\tCL:\"bismark $command_line\"\n"; # @PG = program, ID = unique identifier, VN = program version, CL = command line
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5739
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5740 ### I would like to thank the following individuals for their valuable contributions to the Bismark SAM output format:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5741 ### O. Tam (Sep 2010), C. Whelan (2011), E. Vidal (2011), T. McBryan (2011), P. Hickey (2011)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5742
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5743 sub single_end_SAM_output{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5744 my ($id,$actual_seq,$methylation_call_params,$qual) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5745 my $strand = $methylation_call_params->{$id}->{alignment_strand};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5746 my $chr = $methylation_call_params->{$id}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5747 my $start = $methylation_call_params->{$id}->{position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5748 my $stop = $methylation_call_params->{$id}->{end_position};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5749 my $ref_seq = $methylation_call_params->{$id}->{unmodified_genomic_sequence};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5750 my $methcall = $methylation_call_params->{$id}->{methylation_call};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5751 my $read_conversion = $methylation_call_params->{$id}->{read_conversion};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5752 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5753 my $number_of_mismatches = $methylation_call_params->{$id}->{number_of_mismatches};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5754 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5755 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5756 ## Bit Description Comment Value
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5757 ## 0x1 template has multiple segments in sequencing 0: single-end 1: paired end value: 2**0 ( 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5758 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2**1 ( 2)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5759 ## 0x4 segment unmapped --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5760 ## 0x8 next segment in the template unmapped --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5761 ## 0x10 SEQ being reverse complemented value: 2**4 ( 16)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5762 ## 0x20 SEQ of the next segment in the template being reversed value: 2**5 ( 32)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5763 ## 0x40 the first segment in the template read 1 value: 2**6 ( 64)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5764 ## 0x80 the last segment in the template read 2 value: 2**7 (128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5765 ## 0x100 secondary alignment --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5766 ## 0x200 not passing quality controls --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5767 ## 0x400 PCR or optical duplicate --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5768
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5769 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5770
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5771 my $flag; # FLAG variable used for SAM format.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5772 if ($strand eq "+"){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5773 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5774 $flag = 0; # 0 for "+" strand (OT)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5775 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5776 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5777 $flag = 16; # 16 for "-" strand (CTOB, yields information for the original bottom strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5778 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5779 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5780 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5781 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5782 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5783 elsif ($strand eq "-"){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5784 if ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5785 $flag = 16; # 16 for "-" strand (OB)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5786 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5787 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5788 $flag = 0; # 0 for "+" strand (CTOT, yields information for the original top strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5789 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5790 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5791 die "Unexpected strand and read/genome conversion: strand: $strand, read conversion: $read_conversion, genome_conversion: $genome_conversion\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5792 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5793 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5794 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5795 die "Unexpected strand information: $strand\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5796 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5797
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5798 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5799
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5800 my $mapq = 255; # Assume mapping quality is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5801
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5802 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5803
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5804 my $cigar;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5805 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5806 $cigar = $methylation_call_params->{$id}->{CIGAR}; # Actual CIGAR string reported by Bowtie 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5807 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5808 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5809 $cigar = length($actual_seq) . "M"; # Bowtie 1 output does not contain indels (only matches and mismatches)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5810 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5811
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5812 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5813
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5814 my $rnext = "*"; # Paired-end variable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5815
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5816 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5817
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5818 my $pnext = 0; # Paired-end variable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5819
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5820 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5821
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5822 my $tlen = 0; # Paired-end variable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5823
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5824 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5825
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5826 if ($read_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5827 $ref_seq = substr($ref_seq, 0, length($ref_seq) - 2); # Removes additional nucleotides from the 3' end. This only works for the original top or bottom strands
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5828 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5829 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5830 $ref_seq = substr($ref_seq, 2, length($ref_seq) - 2); # Removes additional nucleotides from the 5' end. This works for the complementary strands in non-directional libraries
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5831 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5832
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5833 if ($strand eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5834 $actual_seq = revcomp($actual_seq); # Sequence represented on the forward genomic strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5835 $ref_seq = revcomp($ref_seq); # Required for comparison with actual sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5836 $qual = reverse $qual; # if the sequence was reverse-complemented the quality string needs to be reversed as well
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5837 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5838
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5839 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5840
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5841 my $hemming_dist = hemming_dist($actual_seq,$ref_seq); # Edit distance to the reference, i.e. minimal number of one-nucleotide edits needed to transform the read string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5842 # into the reference string. hemming_dist()
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5843 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5844 $hemming_dist += $methylation_call_params->{$id}->{indels}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5845 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5846
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5847 my $NM_tag = "NM:i:$hemming_dist"; # Optional tag NM: edit distance based on nucleotide differences
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5848
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5849 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5850
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5851 my $XX_tag = make_mismatch_string($actual_seq, $ref_seq); # Optional tag XX: string providing mismatched reference bases in the alignment (NO indel information!)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5852
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5853 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5854
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5855 my $XM_tag; # Optional tag XM: Methylation Call String
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5856 if ($strand eq '+'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5857 $XM_tag = "XM:Z:$methcall";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5858 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5859 elsif ($strand eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5860 $XM_tag = 'XM:Z:'.reverse $methcall; # if the sequence was reverse-complemented the methylation call string needs to be reversed as well
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5861 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5862
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5863 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5864
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5865 my $XR_tag = "XR:Z:$read_conversion"; # Optional tag XR: Read Conversion
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5866
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5867 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5868
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5869 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5870
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5871 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5872
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5873 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5874 print OUT join("\t",($id,$flag,$chr,$start,$mapq,$cigar,$rnext,$pnext,$tlen,$actual_seq,$qual,$NM_tag,$XX_tag,$XM_tag,$XR_tag,$XG_tag)),"\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5875 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5876
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5877
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5878 sub paired_end_SAM_output{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5879 my ($id,$actual_seq_1,$actual_seq_2,$methylation_call_params,$qual_1,$qual_2) = @_;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5880 my $strand_1 = $methylation_call_params->{$id}->{alignment_read_1}; # Bowtie 1 only reports the read 1 alignment strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5881 my $strand_2 = $methylation_call_params->{$id}->{alignment_read_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5882 my $chr = $methylation_call_params->{$id}->{chromosome};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5883 my $ref_seq_1 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5884 my $ref_seq_2 = $methylation_call_params->{$id}->{unmodified_genomic_sequence_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5885 my $methcall_1 = $methylation_call_params->{$id}->{methylation_call_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5886 my $methcall_2 = $methylation_call_params->{$id}->{methylation_call_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5887 my $read_conversion_1 = $methylation_call_params->{$id}->{read_conversion_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5888 my $read_conversion_2 = $methylation_call_params->{$id}->{read_conversion_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5889 my $genome_conversion = $methylation_call_params->{$id}->{genome_conversion};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5890 my $number_of_mismatches_1 = $methylation_call_params->{$id}->{number_of_mismatches_1}; # only needed for custom allele-specific output, not the default!
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5891 my $number_of_mismatches_2 = $methylation_call_params->{$id}->{number_of_mismatches_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5892
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5893 my $id_1 = $id.'/1';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5894 my $id_2 = $id.'/2';
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5895
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5896 # Allows all degenerate nucleotide sequences in reference genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5897 die "Reference sequence ($ref_seq_1) contains invalid nucleotides!\n" if $ref_seq_1 =~ /[^ACTGNRYMKSWBDHV]/i;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5898 die "Reference sequence ($ref_seq_2) contains invalid nucleotides!\n" if $ref_seq_2 =~ /[^ACTGNRYMKSWBDHV]/i;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5899
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5900 my $index; # used to store the strand origin of the alignment in a less convoluted way
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5901
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5902 if ($read_conversion_1 eq 'CT' and $genome_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5903 $index = 0; ## this is OT (original top strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5904 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5905 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'GA'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5906 $index = 1; ## this is CTOB (complementary to OB)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5907 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5908 elsif ($read_conversion_1 eq 'GA' and $genome_conversion eq 'CT'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5909 $index = 2; ## this is CTOT (complementary to OT)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5910 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5911 elsif ($read_conversion_1 eq 'CT' and $genome_conversion eq 'GA'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5912 $index = 3; ## this is OB (original bottom)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5913 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5914 else {
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5915 die "Unexpected combination of read 1 and genome conversion: $read_conversion_1 / $genome_conversion\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5916 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5917
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5918 ### we need to remove 2 bp of the genomic sequence as we were extracting read + 2bp long fragments to make a methylation call at the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5919 ### first or last position.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5920
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5921 if ($index == 0 or $index == 3){ # OT or OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5922 $ref_seq_1 = substr($ref_seq_1,0,length($ref_seq_1)-2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5923 $ref_seq_2 = substr($ref_seq_2,2,length($ref_seq_2)-2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5924 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5925 else{ # CTOT or CTOB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5926 $ref_seq_1 = substr($ref_seq_1,2,length($ref_seq_1)-2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5927 $ref_seq_2 = substr($ref_seq_2,0,length($ref_seq_2)-2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5928 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5929
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5930 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5931
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5932 my $start_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5933 my $start_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5934 # adjusting start positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5935
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5936 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5937 $start_read_1 = $methylation_call_params->{$id}->{position_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5938 $start_read_2 = $methylation_call_params->{$id}->{position_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5939 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5940 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5941 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5942 $start_read_1 = $methylation_call_params->{$id}->{start_seq_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5943 $start_read_2 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_2) + 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5944 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5945 else{ # read 1 is on the - strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5946 $start_read_1 = $methylation_call_params->{$id}->{alignment_end} - length ($actual_seq_1) + 1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5947 $start_read_2 = $methylation_call_params->{$id}->{start_seq_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5948 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5949 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5950
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5951 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5952
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5953 my $end_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5954 my $end_read_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5955 # adjusting end positions
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5956
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5957 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5958 $end_read_1 = $methylation_call_params->{$id}->{end_position_1};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5959 $end_read_2 = $methylation_call_params->{$id}->{end_position_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5960 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5961 else{ # Bowtie 1 output. $strand_1 stores the alignment of Read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5962 if ($strand_1 eq '+'){ # Read 1 aligns to the + strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5963 $end_read_1 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_1)-1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5964 $end_read_2 = $methylation_call_params->{$id}->{alignment_end};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5965 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5966 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5967 $end_read_1 = $methylation_call_params->{$id}->{alignment_end};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5968 $end_read_2 = $methylation_call_params->{$id}->{start_seq_1} + length ($actual_seq_2)-1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5969 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5970 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5971
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5972 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5973
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5974 ### This is a description of the bitwise FLAG field which needs to be set for the SAM file taken from: "The SAM Format Specification (v1.4-r985), September 7, 2011"
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5975 ## FLAG: bitwise FLAG. Each bit is explained in the following table:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5976 ## Bit Description Comment Value
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5977 ## 0x1 template having multiple segments in sequencing 0: single-end 1: paired end value: 2^^0 ( 1)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5978 ## 0x2 each segment properly aligned according to the aligner true only for paired-end alignments value: 2^^1 ( 2)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5979 ## 0x4 segment unmapped --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5980 ## 0x8 next segment in the template unmapped --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5981 ## 0x10 SEQ being reverse complemented - strand alignment value: 2^^4 ( 16)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5982 ## 0x20 SEQ of the next segment in the template being reversed + strand alignment value: 2^^5 ( 32)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5983 ## 0x40 the first segment in the template read 1 value: 2^^6 ( 64)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5984 ## 0x80 the last segment in the template read 2 value: 2^^7 (128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5985 ## 0x100 secondary alignment --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5986 ## 0x200 not passing quality controls --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5987 ## 0x400 PCR or optical duplicate --- ---
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5988
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5989 ### As the FLAG values do not consider that there might be 4 different bisulfite strands of DNA, we are trying to make FLAG tags which take the strand identity into account
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5990
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5991 # strands OT and CTOT will be treated as aligning to the top strand (both sequences are scored as aligning to the top strand)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5992 # strands OB and CTOB will be treated as aligning to the bottom strand (both sequences are scored as reverse complemented sequences)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5993
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5994 my $flag_1; # FLAG variable used for SAM format
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5995 my $flag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5996
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5997 if ($index == 0){ # OT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5998 $flag_1 = 67; # Read 1 is on the + strand (1+2+64) (Read 2 is technically reverse-complemented, but we do not score it)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
5999 $flag_2 = 131; # Read 2 is on - strand but informative for the OT (1+2+128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6000 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6001 elsif ($index == 1){ # CTOB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6002 $flag_1 = 115; # Read 1 is on the + strand, we score for OB (1+2+16+32+64)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6003 $flag_2 = 179; # Read 2 is on the - strand (1+2+16+32+128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6004 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6005 elsif ($index == 2){ # CTOT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6006 $flag_1 = 67; # Read 1 is on the - strand (CTOT) strand, but we score it for OT (1+2+64)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6007 $flag_2 = 131; # Read 2 is on the + strand, score it for OT (1+2+128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6008 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6009 elsif ($index == 3){ # OB
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6010 $flag_1 = 115; # Read 1 is on the - strand, we score for OB (1+2+16+32+64)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6011 $flag_2 = 179; # Read 2 is on the + strand (1+2+16+32+128)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6012 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6013
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6014 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6015
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6016 my $mapq = 255; # Mapping quality is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6017
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6018 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6019
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6020 my $cigar_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6021 my $cigar_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6022
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6023 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6024 $cigar_1 = $methylation_call_params->{$id}->{CIGAR_1}; # Actual CIGAR string reported by Bowtie 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6025 $cigar_2 = $methylation_call_params->{$id}->{CIGAR_2};
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6026 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6027 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6028 $cigar_1 = length($actual_seq_1) . "M"; # Assume no indels for Bowtie 1 mapping (only matches and mismatches)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6029 $cigar_2 = length($actual_seq_2) . "M";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6030 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6031
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6032 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6033
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6034 my $rnext = '='; # Chromosome of mate; applies to both reads
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6035
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6036 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6037
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6038 my $pnext_1 = $start_read_2; # Leftmost position of mate
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6039 my $pnext_2 = $start_read_1;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6040
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6041 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6042
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6043 my $tlen_1; # signed observed Template LENgth (or inferred fragment size)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6044 my $tlen_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6045
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6046 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6047
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6048 if ($start_read_1 <= $start_read_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6049
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6050 # Read 1 alignment is leftmost
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6051
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6052 if ($end_read_2 >= $end_read_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6053
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6054 # -------------------------> read 1 reads overlapping
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6055 # <------------------------- read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6056 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6057 # or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6058 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6059 # -------------------------> read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6060 # <----------------------- read 2 read 2 contained within read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6061 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6062 # or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6063 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6064 # -------------------------> read 1 reads 1 and 2 exactly overlapping
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6065 # <------------------------- read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6066 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6067
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6068 # dovetailing of reads is not enabled for Bowtie 2 alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6069
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6070 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6071 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6072 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6073 elsif ($end_read_2 < $end_read_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6074
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6075 # -------------------------> read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6076 # <----------- read 2 read 2 contained within read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6077 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6078 # or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6079 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6080 # -------------------------> read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6081 # <----------- read 2 read 2 contained within read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6082
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6083 # start and end of read 2 are fully contained within read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6084 $tlen_1 = 0; # Set as 0 when the information is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6085 $tlen_2 = 0; # Set as 0 when the information is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6086 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6087
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6088 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6089
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6090 elsif ($start_read_2 < $start_read_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6091
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6092 if ($end_read_1 >= $end_read_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6093
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6094 # Read 2 alignment is leftmost
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6095
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6096 # -------------------------> read 2 reads overlapping
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6097 # <------------------------- read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6098 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6099 # or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6100 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6101 # -------------------------> read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6102 # <----------------------- read 1 read 1 contained within read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6103 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6104 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6105
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6106 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6107 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6108 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6109 elsif ($end_read_1 < $end_read_2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6110
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6111 # -------------------------> read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6112 # <----------- read 1 read 1 contained within read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6113 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6114 # or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6115 #
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6116 # -------------------------> read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6117 # <----------- read 1 read 1 contained within read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6118
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6119 # start and end of read 1 are fully contained within read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6120 $tlen_1 = 0; # Set as 0 when the information is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6121 $tlen_2 = 0; # Set as 0 when the information is unavailable
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6122 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6123 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6124 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6125
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6126 else{ # Bowtie 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6127
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6128 if ($end_read_2 >= $end_read_1){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6129 # Read 1 alignment is leftmost
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6130 # -------------------------> read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6131 # <------------------------- read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6132 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6133
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6134 $tlen_1 = $end_read_2 - $start_read_1 + 1; # Leftmost read has a + sign,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6135 $tlen_2 = $start_read_1 - $end_read_2 - 1; # Rightmost read has a - sign
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6136 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6137 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6138 # Read 2 alignment is leftmost
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6139 # -------------------------> read 2
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6140 # <------------------------- read 1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6141 # this is the most extreme case for Bowtie 1 alignments, reads do not contain each other, also no dovetailing
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6142
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6143 $tlen_2 = $end_read_1 - $start_read_2 + 1; # Leftmost read has a + sign,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6144 $tlen_1 = $start_read_2 - $end_read_1 - 1; # Rightmost read has a - sign
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6145 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6146 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6147
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6148 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6149
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6150 # adjusting the strand of the sequence before we use them to generate mismatch strings
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6151 if ($strand_1 eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6152 $actual_seq_1 = revcomp($actual_seq_1); # Sequence represented on the forward genomic strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6153 $ref_seq_1 = revcomp($ref_seq_1); # Required for comparison with actual sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6154 $qual_1 = reverse $qual_1; # we need to reverse the quality string as well
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6155 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6156 if ($strand_2 eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6157 $actual_seq_2 = revcomp($actual_seq_2); # Mate sequence represented on the forward genomic strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6158 $ref_seq_2 = revcomp($ref_seq_2); # Required for comparison with actual sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6159 $qual_2 = reverse $qual_2; # If the sequence gets reverse complemented we reverse the quality string as well
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6160 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6161
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6162 # print "$actual_seq_1\n$ref_seq_1\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6163 # print "$actual_seq_2\n$ref_seq_2\n\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6164
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6165 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6166
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6167 my $hemming_dist_1 = hemming_dist($actual_seq_1,$ref_seq_1); # Minimal number of one-nucleotide edits needed to transform the read string into the reference sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6168 my $hemming_dist_2 = hemming_dist($actual_seq_2,$ref_seq_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6169 if ($bowtie2){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6170 $hemming_dist_1 += $methylation_call_params->{$id}->{indels_1}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6171 $hemming_dist_2 += $methylation_call_params->{$id}->{indels_2}; # Adding the number of inserted/deleted bases which we parsed while getting the genomic sequence
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6172 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6173 my $NM_tag_1 = "NM:i:$hemming_dist_1"; # Optional tag NM: edit distance based on nucleotide differences
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6174 my $NM_tag_2 = "NM:i:$hemming_dist_2"; # Optional tag NM: edit distance based on nucleotide differences
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6175
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6176 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6177
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6178 my $XX_tag_1 = make_mismatch_string($actual_seq_1,$ref_seq_1); # Optional tag XX: String providing mismatched reference bases in the alignment (NO indel information!)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6179 my $XX_tag_2 = make_mismatch_string($actual_seq_2,$ref_seq_2);
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6180
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6181 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6182
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6183 my $XM_tag_1; # Optional tag XM: Methylation call string
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6184 my $XM_tag_2;
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6185
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6186 if ($strand_1 eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6187 $XM_tag_1 = 'XM:Z:'.reverse $methcall_1; # Needs to be reversed if the sequence was reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6188 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6189 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6190 $XM_tag_1 = "XM:Z:$methcall_1";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6191 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6192
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6193 if ($strand_2 eq '-'){
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6194 $XM_tag_2 = 'XM:Z:'.reverse $methcall_2; # Needs to be reversed if the sequence was reverse complemented
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6195 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6196 else{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6197 $XM_tag_2 = "XM:Z:$methcall_2";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6198 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6199
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6200 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6201
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6202 my $XR_tag_1 = "XR:Z:$read_conversion_1"; # Optional tag XR: Read 1 conversion state
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6203 my $XR_tag_2 = "XR:Z:$read_conversion_2"; # Optional tag XR: Read 2 conversion state
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6204
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6205 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6206
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6207 my $XG_tag = "XG:Z:$genome_conversion"; # Optional tag XG: Genome Conversion state; valid for both reads
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6208
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6209 #####
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6210
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6211 # SAM format: QNAME, FLAG, RNAME, 1-based POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, optional fields
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6212 print OUT join("\t", ($id_1, $flag_1, $chr, $start_read_1, $mapq, $cigar_1, $rnext, $pnext_1, $tlen_1, $actual_seq_1, $qual_1, $NM_tag_1, $XX_tag_1, $XM_tag_1,$XR_tag_1,$XG_tag)), "\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6213 print OUT join("\t", ($id_2, $flag_2, $chr, $start_read_2, $mapq, $cigar_2, $rnext, $pnext_2, $tlen_2, $actual_seq_2, $qual_2, $NM_tag_2, $XX_tag_2, $XM_tag_2,$XR_tag_2,$XG_tag)), "\n";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6214 }
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6215
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
# Return the reverse complement of a nucleotide sequence.
#
# Argument: the sequence string; dies if it is missing or evaluates to false.
# Returns:  the reversed sequence with A<->T and C<->G exchanged. Lower-case
#           a/c/t/g are complemented to UPPER-case letters; every other
#           character (e.g. N) is carried over unchanged.
sub revcomp {
    my ($sequence) = @_;
    die "Missing seq to reverse complement\n" unless $sequence;

    # Reverse first, then complement in place on the reversed copy
    my $reverse_complement = scalar reverse $sequence;
    $reverse_complement =~ tr/ACTGactg/TGACTGAC/;

    return $reverse_complement;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6222
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
# Count the positions at which a read differs from its reference sequence
# ("hemming" is the historical spelling of Hamming, kept so callers still work).
#
# Arguments: the read sequence, then the reference sequence.
# Returns:   the number of read positions whose character is not identical to
#            the reference character at the same offset. Read positions lying
#            beyond the end of a shorter reference count as mismatches;
#            reference bases beyond the end of the read are ignored.
sub hemming_dist {
    my ($actual_seq, $ref_seq) = @_;

    # String XOR yields "\0" wherever both sequences carry the same character
    # (the same technique make_mismatch_string uses) and pads the shorter
    # operand with "\0", so no per-character arrays are needed and a shorter
    # reference no longer triggers 'uninitialized value' warnings. The operands
    # are interpolated to force string (not numeric) XOR, and the result is cut
    # to the read length so that only read positions are compared.
    my $differences = substr("$actual_seq" ^ "$ref_seq", 0, length $actual_seq);

    # With /c and an empty replacement list, tr only counts the non-NUL
    # characters, i.e. the mismatching positions
    my $mismatches = ($differences =~ tr/\0//c);

    return $mismatches;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6232
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
# Build the optional SAM tag XX describing the mismatches of a read against
# the reference (substitutions only, no indel information).
#
# Arguments: the read sequence, then the reference sequence; dies if either is
#            missing or evaluates to false.
# Returns:   'XX:Z:' followed by alternating run lengths of matching bases and
#            the reference base at each mismatching position, e.g. 'XX:Z:2T1'.
#            Runs of length 0 (adjacent mismatches, or a mismatch at the very
#            start or end) are omitted.
sub make_mismatch_string {
    my $actual_seq = shift or die "Missing actual sequence";
    my $ref_seq    = shift or die "Missing reference sequence";

    my $XX_tag     = "XX:Z:";
    my $ref_length = length $ref_seq;

    # Bitwise string XOR: "\0" at every position where read and reference agree
    my $tmp         = ($actual_seq ^ $ref_seq);
    my $prev_mm_pos = 0;                          # 1-based position of the previous mismatch

    while ($tmp =~ /[^\0]/g) {                    # visit every position that differs
        my $mm_pos    = pos($tmp);                # 1-based position of this mismatch
        my $nuc_match = $mm_pos - $prev_mm_pos - 1;   # matching bases since the previous mismatch

        # Reference base at the mismatch. It stays undefined when the position
        # lies beyond the end of the reference (read longer than reference).
        # The declaration is deliberately unconditional: 'my $x = ... if COND'
        # has undefined behaviour according to perlsyn.
        my $nuc_mm;
        $nuc_mm = substr($ref_seq, $mm_pos - 1, 1) if $mm_pos <= $ref_length;

        $XX_tag .= "$nuc_match" if $nuc_match > 0;    # nothing to report between adjacent mismatches
        $XX_tag .= "$nuc_mm"    if defined $nuc_mm;   # skip positions past the end of the reference
        $prev_mm_pos = $mm_pos;
    }

    # Matching bases between the last mismatch and the end of the reference
    my $end_matches = $ref_length - $prev_mm_pos;
    $XX_tag .= "$end_matches" if $end_matches > 0;    # omitted when the last base is a mismatch

    return $XX_tag;
}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6250
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6251
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6252
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6253 sub print_helpfile{
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6254 print << "HOW_TO";
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6255
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6256
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6257 This program is free software: you can redistribute it and/or modify
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6258 it under the terms of the GNU General Public License as published by
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6259 the Free Software Foundation, either version 3 of the License, or
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6260 (at your option) any later version.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6261
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6262 This program is distributed in the hope that it will be useful,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6263 but WITHOUT ANY WARRANTY; without even the implied warranty of
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6264 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6265 GNU General Public License for more details.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6266 You should have received a copy of the GNU General Public License
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6267 along with this program. If not, see <http://www.gnu.org/licenses/>.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6268
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6269
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6270
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6271 DESCRIPTION
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6272
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6273
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6274 The following is a brief description of command line options and arguments to control the Bismark
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6275 bisulfite mapper and methylation caller. Bismark takes in FastA or FastQ files and aligns the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6276 reads to a specified bisulfite genome. Sequence reads are transformed into a bisulfite converted forward strand
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6277 version (C->T conversion) or into a bisulfite treated reverse strand (G->A conversion of the forward strand).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6278 Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6279 (C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6280 forward strand, by doing this alignments will produce the same positions). These 4 instances of Bowtie (1 or 2)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6281 are run in parallel. The sequence file(s) are then read in again sequence by sequence to pull out the original
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6282 sequence from the genome and determine if there were any protected C's present or not.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6283
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6284 As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6285 re-enabled by using --non_directional.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6286
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6287 The final output of Bismark is in SAM format by default. For Bowtie 1 one can alos choose to report the old
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6288 'vanilla' output format, which is a single tab delimited file with all sequences that have a unique best
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6289 alignment to any of the 4 possible strands of a bisulfite PCR product. Both formats are described in more detail below.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6290
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6291
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6292 USAGE: bismark [options] <genome_folder> {-1 <mates1> -2 <mates2> | <singles>}
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6293
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6294
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6295 ARGUMENTS:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6296
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6297 <genome_folder> The path to the folder containing the unmodified reference genome
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6298 as well as the subfolders created by the Bismark_Genome_Preparation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6299 script (/Bisulfite_Genome/CT_conversion/ and /Bisulfite_Genome/GA_conversion/).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6300 Bismark expects one or more fastA files in this folder (file extension: .fa
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6301 or .fasta). The path can be relative or absolute.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6302
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6303 -1 <mates1> Comma-separated list of files containing the #1 mates (filename usually includes
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6304 "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6305 correspond file-for-file and read-for-read with those specified in <mates2>.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6306 Reads may be a mix of different lengths. Bismark will produce one mapping result
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6307 and one report file per paired-end input file pair.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6308
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6309 -2 <mates2> Comma-separated list of files containing the #2 mates (filename usually includes
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6310 "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6311 correspond file-for-file and read-for-read with those specified in <mates1>.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6312 Reads may be a mix of different lengths.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6313
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6314 <singles> A comma- or space-separated list of files containing the reads to be aligned (e.g.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6315 lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6316 produce one mapping result and one report file per input file.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6317
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6318
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6319 OPTIONS:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6320
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6321
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6322 Input:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6323
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6324 -q/--fastq The query input files (specified as <mate1>,<mate2> or <singles> are FASTQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6325 files (usually having extension .fg or .fastq). This is the default. See also
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6326 --solexa-quals.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6327
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6328 -f/--fasta The query input files (specified as <mate1>,<mate2> or <singles> are FASTA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6329 files (usually havin extension .fa, .mfa, .fna or similar). All quality values
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6330 are assumed to be 40 on the Phred scale.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6331
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6332 -s/--skip <int> Skip (i.e. do not align) the first <int> reads or read pairs from the input.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6333
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6334 -u/--upto <int> Only aligns the first <int> reads or read pairs from the input. Default: no limit.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6335
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6336 --phred33-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 33. Default: on.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6337
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6338 --phred64-quals FASTQ qualities are ASCII chars equal to the Phred quality plus 64. Default: off.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6339
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6340 --solexa-quals Convert FASTQ qualities from solexa-scaled (which can be negative) to phred-scaled
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6341 (which can't). The formula for conversion is:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6342 phred-qual = 10 * log(1 + 10 ** (solexa-qual/10.0)) / log(10). Used with -q. This
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6343 is usually the right option for use with (unconverted) reads emitted by the GA
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6344 Pipeline versions prior to 1.3. Works only for Bowtie 1. Default: off.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6345
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6346 --solexa1.3-quals Same as --phred64-quals. This is usually the right option for use with (unconverted)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6347 reads emitted by GA Pipeline version 1.3 or later. Default: off.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6348
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6349 --path_to_bowtie The full path </../../> to the Bowtie (1 or 2) installation on your system. If not
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6350 specified it is assumed that Bowtie (1 or 2) is in the PATH.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6351
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6352
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6353 Alignment:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6354
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6355 -n/--seedmms <int> The maximum number of mismatches permitted in the "seed", i.e. the first L base pairs
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6356 of the read (where L is set with -l/--seedlen). This may be 0, 1, 2 or 3 and the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6357 default is 1. This option is only available for Bowtie 1 (for Bowtie 2 see -N).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6358
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6359 -l/--seedlen The "seed length"; i.e., the number of bases of the high quality end of the read to
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6360 which the -n ceiling applies. The default is 28. Bowtie (and thus Bismark) is faster for
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6361 larger values of -l. This option is only available for Bowtie 1 (for Bowtie 2 see -L).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6362
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6363 -e/--maqerr <int> Maximum permitted total of quality values at all mismatched read positions throughout
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6364 the entire alignment, not just in the "seed". The default is 70. Like Maq, bowtie rounds
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6365 quality values to the nearest 10 and saturates at 30. This value is not relevant for
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6366 Bowtie 2.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6367
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6368 --chunkmbs <int> The number of megabytes of memory a given thread is given to store path descriptors in
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6369 --best mode. Best-first search must keep track of many paths at once to ensure it is
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6370 always extending the path with the lowest cumulative cost. Bowtie tries to minimize the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6371 memory impact of the descriptors, but they can still grow very large in some cases. If
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6372 you receive an error message saying that chunk memory has been exhausted in --best mode,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6373 try adjusting this parameter up to dedicate more memory to the descriptors. This value
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6374 is not relevant for Bowtie 2. Default: 512.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6375
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6376 -I/--minins <int> The minimum insert size for valid paired-end alignments. E.g. if -I 60 is specified and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6377 a paired-end alignment consists of two 20-bp alignments in the appropriate orientation
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6378 with a 20-bp gap between them, that alignment is considered valid (as long as -X is also
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6379 satisfied). A 19-bp gap would not be valid in that case. Default: 0.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6380
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6381 -X/--maxins <int> The maximum insert size for valid paired-end alignments. E.g. if -X 100 is specified and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6382 a paired-end alignment consists of two 20-bp alignments in the proper orientation with a
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6383 60-bp gap between them, that alignment is considered valid (as long as -I is also satisfied).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6384 A 61-bp gap would not be valid in that case. Default: 500.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6385
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6386
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6387 Bowtie 1 Reporting:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6388
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6389 -k <2> Due to the way Bismark works Bowtie will report up to 2 valid alignments. This option
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6390 will be used by default.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6391
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6392 --best Make Bowtie guarantee that reported singleton alignments are "best" in terms of stratum
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6393 (i.e. number of mismatches, or mismatches in the seed in the case of -n mode) and in
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6394 terms of the quality; e.g. a 1-mismatch alignment where the mismatch position has Phred
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6395 quality 40 is preferred over a 2-mismatch alignment where the mismatched positions both
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6396 have Phred quality 10. When --best is not specified, Bowtie may report alignments that
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6397 are sub-optimal in terms of stratum and/or quality (though an effort is made to report
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6398 the best alignment). --best mode also removes all strand bias. Note that --best does not
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6399 affect which alignments are considered "valid" by Bowtie, only which valid alignments
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6400 are reported by Bowtie. Bowtie is about 1-2.5 times slower when --best is specified.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6401 Default: on.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6402
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6403 --no_best Disables the --best option which is on by default. This can speed up the alignment process,
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6404 e.g. for testing purposes, but for credible results it is not recommended to disable --best.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6405
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6406
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6407 Output:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6408
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6409 --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6410 bisulfite strands will be reported. Default: OFF.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6411
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6412 (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6413 to the original strands are merely theoretical and should not exist in reality. Specifying directional
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6414 alignments (which is the default) will only run 2 alignment threads to the original top (OT)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6415 or bottom (OB) strands in parallel and report these alignments. This is the recommended option
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6416 for strand-specific libraries).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6417
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6418 --sam-no-hd Suppress SAM header lines (starting with @). This might be useful when very large input files are
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6419 split up into several smaller files to run concurrently and the output files are to be merged.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6420
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6421 --quiet Print nothing besides alignments.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6422
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6423 --vanilla Performs bisulfite mapping with Bowtie 1 and prints the 'old' output (as in Bismark 0.5.X) instead
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6424 of SAM format output.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6425
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6426 -un/--unmapped Write all reads that could not be aligned to a file in the output directory. Written reads will
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6427 appear as they did in the input, without any translation of quality values that may have
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6428 taken place within Bowtie or Bismark. Paired-end reads will be written to two parallel files with _1
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6429 and _2 inserted in their filenames, i.e. _unmapped_reads_1.txt and _unmapped_reads_2.txt. Reads
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6430 with more than one valid alignment with the same number of lowest mismatches (ambiguous mapping)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6431 are also written to _unmapped_reads.txt unless the option --ambiguous is specified as well.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6432
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6433 --ambiguous Write all reads which produce more than one valid alignment with the same number of lowest
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6434 mismatches or other reads that fail to align uniquely to a file in the output directory.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6435 Written reads will appear as they did in the input, without any of the translation of quality
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6436 values that may have taken place within Bowtie or Bismark. Paired-end reads will be written to two
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6437 parallel files with _1 and _2 inserted in their filenames, i.e. _ambiguous_reads_1.txt and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6438 _ambiguous_reads_2.txt. These reads are not written to the file specified with --un.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6439
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6440 -o/--output_dir <dir> Write all output files into this directory. By default the output files will be written into
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6441 the same folder as the input file(s). If the specified folder does not exist, Bismark will attempt
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6442 to create it first. The path to the output folder can be either relative or absolute.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6443
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6444 --temp_dir <dir> Write temporary files to this directory instead of into the same directory as the input files. If
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6445 the specified folder does not exist, Bismark will attempt to create it first. The path to the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6446 temporary folder can be either relative or absolute.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6447
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6448
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6449
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6450 Other:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6451
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6452 -h/--help Displays this help file.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6453
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6454 -v/--version Displays version information.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6455
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6456
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6457 BOWTIE 2 SPECIFIC OPTIONS
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6458
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6459 --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6460 alignments, i.e. searches for alignments involving all read characters (also called
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6461 untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6462 and/or quality trimmed where appropriate. Default: off.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6463
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6464 Bowtie 2 alignment options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6465
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6466 -N <int> Sets the number of mismatches allowed in a seed alignment during multiseed alignment.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6467 Can be set to 0 or 1. Setting this higher makes alignment slower (often much slower)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6468 but increases sensitivity. Default: 0. This option is only available for Bowtie 2 (for
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6469 Bowtie 1 see -n).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6470
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6471 -L <int> Sets the length of the seed substrings to align during multiseed alignment. Smaller values
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6472 make alignment slower but more sensitive. Default: the --sensitive preset of Bowtie 2 is
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6473 used by default, which sets -L to 20. This option is only available for Bowtie 2 (for
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6474 Bowtie 1 see -l).
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6475
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6476 --ignore-quals When calculating a mismatch penalty, always consider the quality value at the mismatched
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6477 position to be the highest possible, regardless of the actual value. I.e. input is treated
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6478 as though all quality values are high. This is also the default behavior when the input
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6479 doesn't specify quality values (e.g. in -f mode). This option is invariable and on by default.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6480
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6481
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6482 Bowtie 2 paired-end options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6483
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6484 --no-mixed This option disables Bowtie 2's behavior to try to find alignments for the individual mates if
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6485 it cannot find a concordant or discordant alignment for a pair. This option is invariable and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6486 on by default.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6487
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6488 --no-discordant Normally, Bowtie 2 looks for discordant alignments if it cannot find any concordant alignments.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6489 A discordant alignment is an alignment where both mates align uniquely, but that does not
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6490 satisfy the paired-end constraints (--fr/--rf/--ff, -I, -X). This option disables that behavior
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6491 and it is on by default.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6492
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6493
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6494 Bowtie 2 effort options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6495
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6496 -D <int> Up to <int> consecutive seed extension attempts can "fail" before Bowtie 2 moves on, using
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6497 the alignments found so far. A seed extension "fails" if it does not yield a new best or a
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6498 new second-best alignment. Default: 15.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6499
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6500 -R <int> <int> is the maximum number of times Bowtie 2 will "re-seed" reads with repetitive seeds.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6501 When "re-seeding," Bowtie 2 simply chooses a new set of seeds (same length, same number of
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6502 mismatches allowed) at different offsets and searches for more alignments. A read is considered
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6503 to have repetitive seeds if the total number of seed hits divided by the number of seeds
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6504 that aligned at least once is greater than 300. Default: 2.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6505
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6506 Bowtie 2 parallelization options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6507
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6508
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6509 -p NTHREADS Launch NTHREADS parallel search threads (default: 1). Threads will run on separate processors/cores
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6510 and synchronize when parsing reads and outputting alignments. Searching for alignments is highly
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6511 parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6512 E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6513 by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6514 library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6515 automatically use the option '--reorder', which guarantees that output SAM records are printed in
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6516 an order corresponding to the order of the reads in the original input file, even when -p is set
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6517 greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6518 setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory than
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6519 if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6520 correspond to input order in that case.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6521
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6522 Bowtie 2 Scoring options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6523
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6524 --score_min <func> Sets a function governing the minimum alignment score needed for an alignment to be considered
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6525 "valid" (i.e. good enough to report). This is a function of read length. For instance, specifying
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6526 L,0,-0.2 sets the minimum-score function f to f(x) = 0 + -0.2 * x, where x is the read length.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6527 See also: setting function options at http://bowtie-bio.sourceforge.net/bowtie2. The default is
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6528 L,0,-0.2.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6529
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6530 --rdg <int1>,<int2> Sets the read gap open (<int1>) and extend (<int2>) penalties. A read gap of length N gets a penalty
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6531 of <int1> + N * <int2>. Default: 5, 3.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6532
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6533 --rfg <int1>,<int2> Sets the reference gap open (<int1>) and extend (<int2>) penalties. A reference gap of length N gets
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6534 a penalty of <int1> + N * <int2>. Default: 5, 3.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6535
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6536
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6537 Bowtie 2 Reporting options:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6538
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6539 -most_valid_alignments <int> This used to be the Bowtie 2 parameter -M. As of Bowtie 2 version 2.0.0 beta7 the option -M is
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6540 deprecated. It will be removed in subsequent versions. What used to be called -M mode is still the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6541 default mode, but adjusting the -M setting is deprecated. Use the -D and -R options to adjust the
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6542 effort expended to find valid alignments.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6543
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6544 For reference, this used to be the old (now deprecated) description of -M:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6545 Bowtie 2 searches for at most <int>+1 distinct, valid alignments for each read. The search terminates when it
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6546 can't find more distinct valid alignments, or when it finds <int>+1 distinct alignments, whichever
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6547 happens first. Only the best alignment is reported. Information from the other alignments is used to
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6548 estimate mapping quality and to set SAM optional fields, such as AS:i and XS:i. Increasing -M makes
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6549 Bowtie 2 slower, but increases the likelihood that it will pick the correct alignment for a read that
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6550 aligns many places. For reads that have more than <int>+1 distinct, valid alignments, Bowtie 2 does not
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6551 guarantee that the alignment reported is the best possible in terms of alignment score. -M is
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6552 always used and its default value is set to 10.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6553
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6554
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6555 'VANILLA' Bismark OUTPUT:
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6556
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6557 Single-end output format (tab-separated):
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6558
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6559 (1) <seq-ID>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6560 (2) <read alignment strand>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6561 (3) <chromosome>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6562 (4) <start position>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6563 (5) <end position>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6564 (6) <observed bisulfite sequence>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6565 (7) <equivalent genomic sequence>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6566 (8) <methylation call>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6567 (9) <read conversion>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6568 (10) <genome conversion>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6569 (11) <read quality score (Phred33)>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6570
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6571
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6572 Paired-end output format (tab-separated):
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6573 (1) <seq-ID>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6574 (2) <read 1 alignment strand>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6575 (3) <chromosome>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6576 (4) <start position>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6577 (5) <end position>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6578 (6) <observed bisulfite sequence 1>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6579 (7) <equivalent genomic sequence 1>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6580 (8) <methylation call 1>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6581 (9) <observed bisulfite sequence 2>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6582 (10) <equivalent genomic sequence 2>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6583 (11) <methylation call 2>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6584 (12) <read 1 conversion>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6585 (13) <genome conversion>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6586 (14) <read 1 quality score (Phred33)>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6587 (15) <read 2 quality score (Phred33)>
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6588
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6589
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6590 Bismark SAM OUTPUT (default):
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6591
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6592 (1) QNAME (seq-ID)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6593 (2) FLAG (this flag tries to take the strand a bisulfite read originated from into account (this is different from ordinary DNA alignment flags!))
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6594 (3) RNAME (chromosome)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6595 (4) POS (start position)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6596 (5) MAPQ (always 255)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6597 (6) CIGAR
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6598 (7) RNEXT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6599 (8) PNEXT
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6600 (9) TLEN
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6601 (10) SEQ
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6602 (11) QUAL (Phred33 scale)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6603 (12) NM-tag (edit distance to the reference)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6604 (13) XX-tag (base-by-base mismatches to the reference. This does not include indels)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6605 (14) XM-tag (methylation call string)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6606 (15) XR-tag (read conversion state for the alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6607 (16) XG-tag (genome conversion state for the alignment)
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6608
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6609 Each read of paired-end alignments is written out in a separate line in the above format.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6610
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6611
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6612 This script was last edited on 21 Aug 2012.
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6613
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6614 HOW_TO
36d124f44c0a inital commit
bjoern-gruening
parents:
diff changeset
6615 }